In [1]:
import os
import glob
import json
import pandas as pd

In [2]:
def load_gpu_util(dlprof_summary_file):
    """Extract GPU utilization figures from a profiler summary JSON file.

    The file must contain a top-level "Summary Report" object holding the
    list-valued entries "GPU Idle %" and "Tensor Core Kernel Utilization %";
    only the first element of each list is used.

    Args:
        dlprof_summary_file: Path of the JSON summary file to read.

    Returns:
        dict with two float entries:
            "sm_util": 100 minus the reported idle percentage.
            "tc_util": the reported Tensor Core kernel utilization percentage.

    Raises:
        KeyError / IndexError: if an expected entry is missing or empty.
    """
    with open(dlprof_summary_file) as handle:
        report = json.load(handle)["Summary Report"]

    idle_pct = report["GPU Idle %"][0]
    tensor_core_pct = report["Tensor Core Kernel Utilization %"][0]

    # Utilization is expressed as the complement of the idle percentage.
    return {
        "sm_util": float(100 - idle_pct),
        "tc_util": float(tensor_core_pct),
    }

def parse_pl_timings(pl_profile_file):
    """Parse a pipe-delimited profiler report into {section name: duration}.

    The first seven lines of the file are skipped (presumably the report
    header -- confirm against the profiler version that wrote the file).
    Every remaining line containing "|" is split on "|"; the first field is
    used as the section name and the second field is parsed as a float.
    Lines without "|" are ignored.

    Args:
        pl_profile_file: Path of the profiler text report.

    Returns:
        dict mapping section name (str) to the float in the second column.
        If a name occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if the second field of a data line is not a number.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(pl_profile_file) as profile_file:
        lines = [line.rstrip("\n") for line in profile_file]

    mean_timings = {}
    for line in lines[7:]:
        if "|" not in line:
            continue
        fields = [field.strip() for field in line.split("|")]
        mean_timings[fields[0]] = float(fields[1])
    return mean_timings

In [3]:
# Benchmark dimensions to collect; results are expected under
# ./results/<gpu>/<compute>/<model>/<seq_len>-<batch_size>/
gpu_names = [
    "v100-16gb-300w"
]

compute_types = [
    "amp"
]

model_names = [
    "distilroberta-base",
    "roberta-base",
    "roberta-large"
]

# Column order of the result table; every appended row must follow it.
columns = ["gpu", "compute", "model", "seq_len", "batch_size",
           "cpu_time", "forward", "backward", "train_loss",
           "vram_usage", "vram_io",
           "sm_util", "tc_util", ]
rows = []

# Profiler sections whose mean durations are summed into the "cpu_time" column.
cpu_time_sections = ["get_train_batch", "on_batch_start", "on_train_batch_start",
                     "training_step_end", "on_after_backward",
                     "on_batch_end", "on_train_batch_end"]

for gn in gpu_names:
    for ct in compute_types:
        for mn in model_names:
            # os.path is used for building and splitting paths because glob
            # returns paths with the OS-native separator.
            path = os.path.join("./results", gn, ct, mn, "*")
            configs = sorted(glob.glob(path), reverse=True)
            for c in configs:
                print(c)
                try:
                    # Directory name encodes the configuration: "<seq_len>-<batch_size>".
                    seq_len, batch_size = os.path.basename(c).split("-")
                    row_1 = [gn, ct, mn, int(seq_len), int(batch_size)]

                    pl_timings = parse_pl_timings(os.path.join(c, "pl_profile.txt"))
                    cpu_time = sum(pl_timings[k] for k in cpu_time_sections)

                    metrics_0 = pd.read_csv(os.path.join(c, "version_0", "metrics.csv"))
                    metrics_1 = pd.read_csv(os.path.join(c, "version_1", "metrics.csv"))

                    # GPU statistics are taken from the version_0 log only.
                    logged_sm_util = metrics_0["gpu_id: 0/utilization.gpu (%)"].mean()
                    vram_usage = metrics_0["gpu_id: 0/memory.used (MB)"].mean()
                    vram_io = metrics_0["gpu_id: 0/utilization.memory (%)"].mean()

                    # Training loss is the average of the two runs' mean losses.
                    train_loss = (metrics_0["train_loss"].mean() + metrics_1["train_loss"].mean())/2
                    row_2 = [cpu_time, pl_timings["model_forward"], pl_timings["model_backward"], train_loss]

                    # Reported SM utilization is the average of the logged GPU
                    # utilization and the value derived from the profiler summary.
                    util_data = load_gpu_util(os.path.join(c, "dlprof_summary.json"))
                    sm_util = (logged_sm_util + util_data["sm_util"])/2
                    row_3 = [vram_usage, vram_io, sm_util, util_data["tc_util"]]

                    row = row_1 + row_2 + row_3
                    print(row)
                    rows.append(row)
                except Exception as e:
                    # Best effort: a configuration with missing or incomplete
                    # artefacts is reported and skipped, not fatal. For a
                    # KeyError the message is just the missing key (e.g. a
                    # profiler section absent from pl_profile.txt).
                    print(e)

./results/v100-16gb-300w/amp/distilroberta-base/512-8
['v100-16gb-300w', 'amp', 'distilroberta-base', 512, 8, 0.0008473449999999999, 0.017489, 0.050107, 0.378791481256485, 5736.666666666667, 34.166666666666664, 39.235975243182, 31.211033692406563]
./results/v100-16gb-300w/amp/distilroberta-base/512-64
'get_train_batch'
./results/v100-16gb-300w/amp/distilroberta-base/512-32
['v100-16gb-300w', 'amp', 'distilroberta-base', 512, 32, 0.0011597369999999999, 0.019586, 0.18686, 0.3064666837453842, 13488.666666666666, 47.416666666666664, 61.14098091247685, 32.58048828871203]
./results/v100-16gb-300w/amp/distilroberta-base/512-256
'get_train_batch'
./results/v100-16gb-300w/amp/distilroberta-base/512-128
'get_train_batch'
./results/v100-16gb-300w/amp/distilroberta-base/512-1
['v100-16gb-300w', 'amp', 'distilroberta-base', 512, 1, 0.0008578929999999999, 0.019573, 0.032949, 0.6610750257968903, 3258.6666666666665, 7.416666666666667, 11.284143158097827, 25.483861828371932]
./results/v100-16gb-300w/am

In [4]:
# Assemble the collected benchmark rows into a table, one row per configuration.
df = pd.DataFrame(data=rows, columns=columns)
# Show up to the first 20 rows as the cell output.
df.head(n=20)

Unnamed: 0,gpu,compute,model,seq_len,batch_size,cpu_time,forward,backward,train_loss,vram_usage,vram_io,sm_util,tc_util
0,v100-16gb-300w,amp,distilroberta-base,512,8,0.000847,0.017489,0.050107,0.378791,5736.666667,34.166667,39.235975,31.211034
1,v100-16gb-300w,amp,distilroberta-base,512,32,0.00116,0.019586,0.18686,0.306467,13488.666667,47.416667,61.140981,32.580488
2,v100-16gb-300w,amp,distilroberta-base,512,1,0.000858,0.019573,0.032949,0.661075,3258.666667,7.416667,11.284143,25.483862
3,v100-16gb-300w,amp,distilroberta-base,256,8,0.000831,0.017823,0.034236,0.41059,3870.666667,15.75,20.680509,32.38327
4,v100-16gb-300w,amp,distilroberta-base,256,64,0.001395,0.019972,0.15003,0.336512,10194.666667,53.75,66.405715,37.347834
5,v100-16gb-300w,amp,distilroberta-base,256,32,0.001139,0.017538,0.077182,0.335163,6488.666667,41.166667,51.082347,36.402965
6,v100-16gb-300w,amp,distilroberta-base,256,1,0.000822,0.018844,0.031686,0.66158,3206.666667,5.666667,8.424434,19.2482
7,v100-16gb-300w,amp,distilroberta-base,128,8,0.000861,0.016974,0.032592,0.461515,3288.666667,8.5,14.233912,30.342406
8,v100-16gb-300w,amp,distilroberta-base,128,64,0.001346,0.018552,0.068529,0.386783,5690.666667,38.333333,49.178726,39.350682
9,v100-16gb-300w,amp,distilroberta-base,128,32,0.001094,0.019062,0.036678,0.397646,4376.666667,21.25,28.310474,37.157099


In [5]:
# Persist the results table; the row index is written as the first CSV column.
df.to_csv(path_or_buf="./results.csv", index=True)