# SageMaker Debugger Profiling Report

SageMaker Debugger auto-generated this report. You can generate similar reports on all supported training jobs. The report provides a summary of the training job, system resource usage statistics, framework metrics, a rules summary, and a detailed analysis from each rule. The graphs and tables are interactive. 

**Legal disclaimer:** This report and any recommendations are provided for informational purposes only and are not definitive. You are responsible for making your own independent assessment of the information.


In [1]:
import json
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np
import datetime
from smdebug.profiler.utils import us_since_epoch_to_human_readable_time, ns_since_epoch_to_human_readable_time
from smdebug.core.utils import setup_profiler_report


[2022-01-15 23:18:01.418 ip-10-0-126-20.ec2.internal:758 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: /opt/ml/processing/input/profiler/signals/ProfilerReport


In [2]:
import bokeh
from bokeh.io import output_notebook, show
from bokeh.layouts import column, row
from bokeh.plotting import figure
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn
from bokeh.models import ColumnDataSource, PreText
from math import pi
from bokeh.transform import cumsum
import warnings
from bokeh.models.widgets import Paragraph
from bokeh.models import Legend
from bokeh.util.warnings import BokehDeprecationWarning, BokehUserWarning
# Ignore Bokeh's deprecation and user warnings from this point on.
warnings.simplefilter('ignore', BokehDeprecationWarning)
warnings.simplefilter('ignore', BokehUserWarning)

# Send Bokeh output to the notebook, with the loading banner hidden.
output_notebook(hide_banner=True)

In [3]:
# Empty default; the "# Parameters" cell that follows assigns the actual ARN.
processing_job_arn = ""

In [4]:
# Parameters
# ARN of a SageMaker processing job (per the ARN's resource type); it is passed
# to setup_profiler_report in the next cell.
processing_job_arn = "arn:aws:sagemaker:us-east-1:709614815312:processing-job/dogimageestimator-2022-01--profilerreport-f23164c1"


In [5]:
# Hand the processing job ARN to smdebug's report setup helper.
# NOTE(review): what the helper configures is not visible in this notebook.
setup_profiler_report(processing_job_arn)

In [6]:
def create_piechart(data_dict, title=None, height=400, width=400, x1=0, x2=0.1, radius=0.4, toolbar_location='right'):
    """Build a Bokeh pie chart with one wedge per entry of ``data_dict``.

    Args:
        data_dict: Mapping from a label to a numeric value. Each wedge's angle
            is its value's share of the sum of all values.
        title: Title of the figure.
        height: Figure height passed to ``figure`` as ``plot_height``.
        width: Figure width passed to ``figure`` as ``plot_width``.
        x1: Extra room left of the pie; the x-range starts at ``-radius - x1``.
        x2: Extra room right of the pie; the x-range ends at ``radius + x2``.
        radius: Radius of the wedges.
        toolbar_location: Forwarded to ``figure``.

    Returns:
        The configured Bokeh figure.
    """
    # One row per entry: the label goes to 'phase', the number to 'value'.
    wedges = pd.Series(data_dict).reset_index(name='value')
    wedges = wedges.rename(columns={'index':'phase'})
    total = wedges['value'].sum()
    wedges['angle'] = wedges['value']/total * 2*pi
    wedges['color'] = bokeh.palettes.viridis(len(data_dict))

    chart = figure(
        plot_height=height,
        plot_width=width,
        toolbar_location=toolbar_location,
        tools="hover,wheel_zoom,reset,pan",
        tooltips="@phase:@value",
        title=title,
        x_range=(-radius-x1, radius+x2),
    )

    # Each wedge runs from the cumulative angle of the previous rows to its own.
    chart.wedge(
        x=0,
        y=0.,
        radius=radius,
        start_angle=cumsum('angle', include_zero=True),
        end_angle=cumsum('angle'),
        line_color="white",
        source=wedges,
        fill_color='color',
        legend='phase',
    )

    # Legend styling.
    chart.legend.label_text_font_size = "8pt"
    chart.legend.location = 'center_right'

    # Hide axes, grid and outline so that only the pie is drawn.
    chart.axis.axis_label = None
    chart.axis.visible = False
    chart.grid.grid_line_color = None
    chart.outline_line_color = "white"

    return chart

In [7]:
from IPython.display import display, HTML, Markdown, Image
def pretty_print(df):
    """Display ``df`` as an HTML table with left-aligned rows.

    Literal backslash-n sequences in the generated HTML are replaced with
    ``<br>`` tags. Returns whatever ``display`` returns.
    """
    left_aligned_row = '<tr style="text-align: left;">'
    table_html = df.to_html()
    table_html = table_html.replace("\\n", "<br>")
    table_html = table_html.replace('<tr>', left_aligned_row)
    return display(HTML(table_html))

## Training job summary

In [8]:
def load_report(rule_name):
    """Load the JSON report written for the rule ``rule_name``.

    Args:
        rule_name: Name of the rule; the report is read from
            ``<profiler-reports dir>/<rule_name>.json``.

    Returns:
        The parsed JSON content, or ``None`` when the report file does not
        exist (a "<rule_name> not triggered" notice is printed in that case).
    """
    report_path = '/opt/ml/processing/output/rule/profiler-output/profiler-reports/' + rule_name + '.json'
    try:
        # The context manager closes the file handle even if parsing fails.
        with open(report_path) as report_file:
            return json.load(report_file)
    except FileNotFoundError:
        print(rule_name + ' not triggered')
        return None

In [9]:

def _format_epoch_us(timestamp_us):
    """Format a microseconds-since-epoch timestamp as ``HH:MM:SS mm/dd/YYYY``."""
    readable = us_since_epoch_to_human_readable_time(timestamp_us)
    parsed = datetime.datetime.strptime(readable, '%Y-%m-%dT%H:%M:%S:%f')
    return parsed.strftime("%H:%M:%S %m/%d/%Y")


def _percent_of(part, total):
    """Return ``part`` as a truncated integer percentage of ``total`` (0 if ``total`` is 0)."""
    if total == 0:
        return 0
    return int(part / total * 100)


# Collect start/end times, durations and the per-phase time split of the job
# from the MaxInitializationTime rule report. The summary cell that follows
# reads job_statistics, job_duration_in_seconds and the *_perc variables.
job_statistics = {}
report = load_report('MaxInitializationTime')
if report:
    details = report['Details']
    has_step_info = "first" in details["step_num"] and "last" in details["step_num"]

    # job_start/job_end are scaled by 1e6 before formatting, i.e. they are
    # handled as seconds here; the step timestamps below are used as
    # microseconds directly.
    job_statistics["Start time"] = _format_epoch_us(details['job_start'] * 1000000)
    job_statistics["End time"] = _format_epoch_us(details['job_end'] * 1000000)
    job_duration_in_seconds = int(details['job_end'] - details['job_start'])
    job_statistics["Job duration"] = f"{job_duration_in_seconds} seconds"

    if has_step_info:
        first_step = details["step_num"]["first"]
        last_step = details["step_num"]["last"]

        job_statistics["Training loop start"] = _format_epoch_us(first_step)
        job_statistics["Training loop end"] = _format_epoch_us(last_step)

        training_loop_duration_in_seconds = int((last_step - first_step) / 1000000)
        job_statistics["Training loop duration"] = f"{training_loop_duration_in_seconds} seconds"
        initialization_in_seconds = int(first_step/1000000 - details['job_start'])
        job_statistics["Initialization time"] = f"{initialization_in_seconds} seconds"
        finalization_in_seconds = int(np.abs(details['job_end'] - last_step/1000000))
        job_statistics["Finalization time"] = f"{finalization_in_seconds} seconds"

        # A job shorter than one second truncates to a duration of 0; the
        # helper reports 0 % in that case instead of dividing by zero.
        initialization_perc = _percent_of(initialization_in_seconds, job_duration_in_seconds)
        job_statistics["Initialization"] = f"{initialization_perc} %"
        training_loop_perc = _percent_of(training_loop_duration_in_seconds, job_duration_in_seconds)
        job_statistics["Training loop"] = f"{training_loop_perc} %"
        finalization_perc = _percent_of(finalization_in_seconds, job_duration_in_seconds)
        job_statistics["Finalization"] = f"{finalization_perc} %"

In [10]:
# Render the training job summary: an intro paragraph, the job statistics
# table and, when step information was profiled, a pie chart of the time split
# between initialization, training loop and finalization.
if report:
    text =  """The following table gives a summary about the training job. The table includes information about when the training job started and ended, how much time initialization, training loop and finalization took."""
    table = None
    plot = None

    if len(job_statistics) > 0:
        step_num = report['Details']["step_num"]
        if "first" in step_num and "last" in step_num:
            # Clamp negative percentages to 0 *before* the table and the pie
            # chart are built, so both show the corrected values.
            initialization_perc = max(initialization_perc, 0)
            training_loop_perc = max(training_loop_perc, 0)
            finalization_perc = max(finalization_perc, 0)
            job_statistics["Initialization"] = f"{initialization_perc} %"
            job_statistics["Training loop"] = f"{training_loop_perc} %"
            job_statistics["Finalization"] = f"{finalization_perc} %"

        df = pd.DataFrame.from_dict(job_statistics, orient='index')
        start_time = us_since_epoch_to_human_readable_time(report['Details']['job_start'] * 1000000)
        date = datetime.datetime.strptime(start_time, '%Y-%m-%dT%H:%M:%S:%f')
        day = date.date().strftime("%m/%d/%Y")
        hour = date.time().strftime("%H:%M:%S")
        duration = job_duration_in_seconds
        # Added exactly once, whether or not step information is available.
        text = f"""{text} \n Your training job started on {day} at {hour} and ran for {duration} seconds."""

        # Two-column table: statistic name and its value.
        df2 = df.reset_index()
        df2.columns = ["0", "1"]
        source = ColumnDataSource(data=df2)
        columns = [TableColumn(field='0', title=""),
                   TableColumn(field='1', title="Job Statistics"),]
        table = DataTable(source=source, columns=columns, width=450, height=380)

    # The "Initialization" entry only exists when step information was profiled.
    if "Initialization" in job_statistics:
        piechart_data = {}
        piechart_data["Initialization"] = initialization_perc
        piechart_data["Training loop"]  = training_loop_perc
        piechart_data["Finalization"]  = finalization_perc

        plot = create_piechart(piechart_data,
                               height=350,
                               width=500,
                               x1=0.15,
                               x2=0.15,
                               radius=0.15,
                               toolbar_location=None)

    if plot is not None:
        paragraph = Paragraph(text=f"""{text}""", width = 800)
    else:
        paragraph = Paragraph(text=f"""{text} No step information was profiled from your training job. The time spent on initialization and finalization cannot be computed.""" , width = 800)

    # Only lay out the widgets that were actually created.
    widgets = [widget for widget in (table, plot) if widget is not None]
    layout_children = [paragraph]
    if widgets:
        layout_children.append(row(*widgets))
    show(column(*layout_children))

## System usage statistics

In [11]:
# Load the OverallSystemUsage rule report; load_report prints a notice and
# yields None when the report file is missing.
report = load_report('OverallSystemUsage')

In [12]:
# Build the two paragraphs shown above the system usage table:
#   text1 - a per-node verdict derived from the p95/p50 CPU and GPU utilization
#   text2 - a fixed description of the table
# Both are only turned into Paragraph widgets when a report was loaded.
text1 = ''
if report:
    if "GPU" in report["Details"]:
        for node_id in report["Details"]["GPU"]:
            gpu_p95 = report["Details"]["GPU"][node_id]["p95"]
            gpu_p50 = report["Details"]["GPU"][node_id]["p50"]
            cpu_p95 = report["Details"]["CPU"][node_id]["p95"]
            cpu_p50 = report["Details"]["CPU"][node_id]["p50"]
            
            # 70 is the utilization threshold used by every branch below.
            if gpu_p95 < 70 and cpu_p95 < 70:
                text1 = f"""{text1}The 95th percentile of the total GPU utilization on node {node_id} is only {int(gpu_p95)}%. 
                The 95th percentile of the total CPU utilization is only {int(cpu_p95)}%. Node {node_id} is underutilized. 
                You may want to consider switching to a smaller instance type."""
            elif gpu_p95 < 70 and cpu_p95 > 70:
                text1 = f"""{text1}The 95th percentile of the total GPU utilization on node {node_id} is only {int(gpu_p95)}%. 
                However, the 95th percentile of the total CPU utilization is {int(cpu_p95)}%. GPUs on node {node_id} are underutilized, 
                likely because of CPU bottlenecks."""
            elif gpu_p50 > 70:
                text1 = f"""{text1}The median total GPU utilization on node {node_id} is {int(gpu_p50)}%. 
                GPUs on node {node_id} are well utilized."""
            else:
                text1 = f"""{text1}The median total GPU utilization on node {node_id} is {int(gpu_p50)}%. 
                The median total CPU utilization is {int(cpu_p50)}%."""
    else:
        # No GPU metrics in the report: only comment on the CPU utilization.
        for node_id in report["Details"]["CPU"]:
            cpu_p95 = report["Details"]["CPU"][node_id]["p95"]
            if cpu_p95 > 70:
                # int(...) call; the former `int**(cpu_p95)` raised a TypeError.
                text1 = f"""{text1}The 95th percentile of the total CPU utilization on node {node_id} is {int(cpu_p95)}%. CPUs on node {node_id} are well utilized."""
    text1 = Paragraph(text=f"""{text1}""", width=1100)
    text2 = Paragraph(text=f"""The following table shows statistics of resource utilization per worker (node), 
    such as the total CPU and GPU utilization, and the memory utilization on CPU and GPU. 
    The table also includes the total I/O wait time and the total amount of data sent or received in bytes.
    The table shows min and max values as well as p99, p90 and p50 percentiles.""", width=900)


In [13]:
# Show the per-node system usage statistics of the loaded report as a table.
# text1 and text2 are the Paragraph widgets prepared in the preceding cell.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
rows = []
units = {"CPU": "percentage", "CPU memory": "percentage", "GPU": "percentage", "Network": "bytes", "GPU memory": "percentage", "I/O": "percentage"}
if report:
    for metric in report['Details']:
        for node_id in report['Details'][metric]:
            values = report['Details'][metric][node_id]
            # A metric without a known unit gets an empty unit instead of
            # aborting the whole table with a KeyError.
            rows.append([node_id, metric, units.get(metric, ""), values['max'], values['p99'], values['p95'], values['p50'], values['min']])

    # Passing the column names to the constructor also works for an empty
    # row list, where assigning df.columns afterwards raises a ValueError.
    df = pd.DataFrame(rows, columns=['Node', 'metric', 'unit', 'max', 'p99', 'p95', 'p50', 'min'])
    df2 = df.reset_index()
    source = ColumnDataSource(data=df2)
    columns = [TableColumn(field='Node', title="node"),
               TableColumn(field='metric', title="metric"),
               TableColumn(field='unit', title="unit"),
               TableColumn(field='max', title="max"),
               TableColumn(field='p99', title="p99"),
               TableColumn(field='p95', title="p95"),
               TableColumn(field='p50', title="p50"),
               TableColumn(field='min', title="min"),]
    # 30 pixels per row; keep at least one row of height for an empty table.
    table = DataTable(source=source, columns=columns, width=800, height=max(df2.shape[0], 1)*30)

    show(column( text1, text2, row(table)))

In [14]:
def _framework_piechart(data_dict, title):
    """Create a pie chart with the size and offsets shared by all charts in this cell."""
    return create_piechart(data_dict,
                           height=350,
                           width=600,
                           x1=0.2,
                           x2=0.6,
                           radius=0.3,
                           title=title)


# Summarize the OverallFrameworkMetrics report as up to three groups of pie
# charts: phases / forward-backward, operator ratio / general operations, and
# Horovod metrics. Each group is only shown when its data is in the report.
report = load_report('OverallFrameworkMetrics')
if report:
    if 'Details' in report:

        display(Markdown(f"""## Framework metrics summary"""))
        plots = []
        text = ''
        if 'phase' in report['Details']:
            text = f"""The following two pie charts show the time spent on the TRAIN phase, the EVAL phase, 
            and others. The 'others' includes the time spent between steps (after one step has finished and before
            the next step has started). Ideally, most of the training time should be spent on the 
            TRAIN and EVAL phases. If TRAIN/EVAL were not specified in the training script, steps will be recorded as 
            GLOBAL."""

            if 'others' in report['Details']['phase']:
                others = float(report['Details']['phase']['others'])

                if others > 25:
                    text = f"""{text} Your training job spent quite a significant amount of time ({round(others,2)}%) in phase "others".
                    You should check what is happening in between the steps."""

                plot = _framework_piechart(report['Details']['phase'],
                                           "The ratio between the time spent on the TRAIN/EVAL phase and others")
                plots.append(plot)

        if 'forward_backward' in report['Details']:

            # Event with the largest share and that share.
            event = max(report['Details']['forward_backward'], key=report['Details']['forward_backward'].get)
            perc = report['Details']['forward_backward'][event]

            text = f"""{text} The pie chart on the right shows a more detailed breakdown. 
            It shows that {int(perc)}% of the time was spent in event "{event}"."""

            if perc > 70:
                # Append to the text built so far instead of replacing it; the
                # share of the dominant event has already been stated above.
                text = f"""{text} There is quite a significant difference between the time spent on forward and backward
                pass."""

            plot = _framework_piechart(report['Details']['forward_backward'],
                                       "The ratio between forward and backward pass")
            plots.append(plot)

        if len(plots) > 0:
            paragraph = Paragraph(text=text, width=1100)
            show(column(paragraph, row(plots)))

        plots = []
        text=''
        if 'ratio' in report['Details'] and len(report['Details']['ratio']) > 0:

            # The text describes the first entry of the ratio mapping.
            key = list(report['Details']['ratio'].keys())[0]
            ratio = report['Details']['ratio'][key]

            text = f"""The following piechart shows a breakdown of the CPU/GPU operators. 
                It shows that {int(ratio)}% of training time was spent on executing the "{key}" operator."""

            plot = _framework_piechart(report['Details']['ratio'],
                                       "The ratio between the time spent on CPU/GPU operators")
            plots.append(plot)


        if 'general' in report['Details']:
            plot = _framework_piechart(report['Details']['general'],
                                       "General framework operations")
            plots.append(plot)

        if len(plots) > 0:
            paragraph = Paragraph(text=text, width=1100)
            show(column(paragraph, row(plots)))

        plots = []
        text = ''
        if 'horovod' in report['Details']:
            display(Markdown(f"""#### Overview: Horovod metrics"""))
            event = max(report['Details']['horovod'], key=report['Details']['horovod'].get)
            perc = report['Details']['horovod'][event]
            text = f"""{text} The following pie chart shows a detailed breakdown of the Horovod metrics profiled
            from your training job. The most expensive function was "{event}" with {int(perc)}%."""

            plot = _framework_piechart(report['Details']['horovod'],
                                       "Horovod metrics ")

            paragraph = Paragraph(text=text, width=1100)
            show(column(paragraph, row(plot)))


## Framework metrics summary

In [15]:
# Overview of the operators that ran on the CPUs: a table sorted by share of
# time next to a pie chart, shown only when the loaded report has 'CPU_total'.
pd.set_option('display.float_format', lambda value: '%.2f' % value)
rows = []
values = []
if report and 'CPU_total' in report['Details']:
    display(Markdown(f"""#### Overview: CPU operators"""))

    cpu_shares = report['Details']['CPU']
    # Operator with the largest percentage and that percentage.
    event = max(cpu_shares, key=cpu_shares.get)
    perc = cpu_shares[event]

    # One table row per operator: rounded percentage, cumulative time, name.
    for function, cpu_share in cpu_shares.items():
        percentage = round(cpu_share, 2)
        time = report['Details']['CPU_total'][function]
        rows.append([percentage, time, function])

    df = pd.DataFrame(rows, columns=['percentage', 'time', 'operator'])
    df = df.sort_values(by=['percentage'], ascending=False)
    source = ColumnDataSource(data=df)

    columns = [
        TableColumn(field='percentage', title="Percentage"),
        TableColumn(field='time', title="Cumulative time in microseconds"),
        TableColumn(field='operator', title="CPU operator"),
    ]
    table = DataTable(source=source, columns=columns, width=550, height=350)

    text = Paragraph(text=f"""The following table shows a list of operators that ran on the CPUs.
        The most expensive operator on the CPUs was "{event}" with {int(perc)} %.""")

    plot = create_piechart(cpu_shares, height=350, width=600, x1=0.2, x2=0.6, radius=0.3)

    show(column(text, row(table, plot)))


#### Overview: CPU operators

In [16]:
# Overview of the operators that ran on GPU: a table sorted by share of time
# next to a pie chart, shown only when the loaded report has 'GPU_total'.
pd.set_option('display.float_format', lambda value: '%.2f' % value)
rows = []
values = []
if report and 'GPU_total' in report['Details']:
    display(Markdown(f"""#### Overview: GPU operators"""))

    gpu_shares = report['Details']['GPU']
    # Operator with the largest percentage and that percentage.
    event = max(gpu_shares, key=gpu_shares.get)
    perc = gpu_shares[event]

    # One table row per operator: rounded percentage, cumulative time, name.
    for function, gpu_share in gpu_shares.items():
        percentage = round(gpu_share, 2)
        time = report['Details']['GPU_total'][function]
        rows.append([percentage, time, function])

    df = pd.DataFrame(rows, columns=['percentage', 'time', 'operator'])
    df = df.sort_values(by=['percentage'], ascending=False)
    source = ColumnDataSource(data=df)

    columns = [
        TableColumn(field='percentage', title="Percentage"),
        TableColumn(field='time', title="Cumulative time in microseconds"),
        TableColumn(field='operator', title="GPU operator"),
    ]
    table = DataTable(source=source, columns=columns, width=450, height=350)

    text = Paragraph(text=f"""The following table shows a list of operators that your training job ran on GPU.
        The most expensive operator on GPU was "{event}" with {int(perc)} %""")

    plot = create_piechart(gpu_shares, height=350, width=600, x1=0.2, x2=0.6, radius=0.3)

    show(column(text, row(table, plot)))

## Rules summary

In [17]:
# Human-readable explanation of what each Debugger built-in profiler rule
# checks, keyed by rule name. Long texts are assembled from adjacent string
# literals, so every fragment keeps its own trailing space.
description = {
    "CPUBottleneck": (
        "Checks if the CPU utilization is high and the GPU utilization is low. "
        "It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive "
        "from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue "
        "if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent."
    ),
    "IOBottleneck": (
        "Checks if the data I/O wait time is high and the GPU utilization is low. "
        "It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. "
        "The rule evaluates the I/O and GPU utilization rates and triggers the issue "
        "if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent."
    ),
    "Dataloader": (
        "Checks how many data loaders are running in parallel and whether the total number is equal the number "
        "of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. "
        "If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU."
    ),
    "GPUMemoryIncrease": "Measures the average GPU memory footprint and triggers if there is a large increase.",
    "BatchSize": (
        "Checks if GPUs are underutilized because the batch size is too small. "
        "To detect this problem, the rule analyzes the average GPU memory footprint, "
        "the CPU and the GPU utilization. "
    ),
    "LowGPUUtilization": (
        "Checks if the GPU utilization is low or fluctuating. "
        "This can happen due to bottlenecks, blocking calls for synchronizations, "
        "or a small batch size."
    ),
    "MaxInitializationTime": (
        "Checks if the time spent on initialization exceeds a threshold percent of the total training time. "
        "The rule waits until the first step of training loop starts. The initialization can take longer "
        "if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes."
    ),
    "LoadBalancing": (
        "Detects workload balancing issues across GPUs. "
        "Workload imbalance can occur in training jobs with data parallelism. "
        "The gradients are accumulated on a primary GPU, and this GPU might be overused "
        "with regard to other GPUs, resulting in reducing the efficiency of data parallelization."
    ),
    "StepOutlier": (
        "Detects outliers in step duration. The step duration for forward and backward pass should be "
        "roughly the same throughout the training. If there are significant outliers, "
        "it may indicate a system stall or bottleneck issues."
    ),
}

In [18]:
# Suggested remedy for each Debugger built-in profiler rule, keyed by rule
# name. Multi-part texts rely on implicit concatenation of adjacent literals.
recommendation = {
    "CPUBottleneck": (
        "Consider increasing the number of data loaders "
        "or applying data pre-fetching."
    ),
    "IOBottleneck": (
        "Pre-fetch data or choose different file formats, such as binary formats that "
        "improve I/O performance."
    ),
    "Dataloader": "Change the number of data loader processes.",
    "GPUMemoryIncrease": "Choose a larger instance type with more memory if footprint is close to maximum available memory.",
    "BatchSize": "The batch size is too small, and GPUs are underutilized. Consider running on a smaller instance type or increasing the batch size.",
    "LowGPUUtilization": (
        "Check if there are bottlenecks, minimize blocking calls, "
        "change distributed training strategy, or increase the batch size."
    ),
    "MaxInitializationTime": (
        "Initialization takes too long. "
        "If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework."
    ),
    "LoadBalancing": (
        "Choose a different distributed training strategy or "
        "a different distributed training framework."
    ),
    "StepOutlier": "Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers.",
}

In [19]:
# Build the "Rules summary" table: one row per rule report found in the
# profiler output directory, sorted by how often the rule triggered.
files = glob.glob('/opt/ml/processing/output/rule/profiler-output/profiler-reports/*json')
summary = {}
for i in files:
    rule_name = i.split('/')[-1].replace('.json','')
    # These two reports are not listed in the summary table.
    if rule_name in ("OverallSystemUsage", "OverallFrameworkMetrics"):
        continue
    # Use a context manager so the file handle is closed right after parsing
    # instead of being left for the garbage collector.
    with open(i) as report_file:
        rule_report = json.load(report_file)
    summary[rule_name] = {
        # Fall back to an empty text for a rule without a prepared
        # description/recommendation rather than aborting the whole table.
        'Description': description.get(rule_name, ''),
        'Recommendation': recommendation.get(rule_name, ''),
        'Number of times rule triggered': rule_report['RuleTriggered'],
        'Number of datapoints': rule_report['Datapoints'],
        'Rule parameters': rule_report['RuleParameters'],
    }

df = pd.DataFrame.from_dict(summary, orient='index')

if df.empty:
    # Without any report there is no column to sort by and no first row.
    display(Markdown("No profiler rule reports were found."))
else:
    df = df.sort_values(by=['Number of times rule triggered'], ascending=False)

    # First row after sorting is the most frequently triggered rule. Look the
    # figures up by column name so the text cannot drift from the column order.
    most_triggered = df.iloc[0]
    display(Markdown(f"""The following table shows a profiling summary of the Debugger built-in rules. 
The table is sorted by the rules that triggered the most frequently. During your training job, the {most_triggered.name} rule
was the most frequently triggered. It processed {most_triggered['Number of datapoints']} datapoints and was triggered {most_triggered['Number of times rule triggered']} times."""))

    with pd.option_context('display.colheader_justify','left'):
        pretty_print(df)

The following table shows a profiling summary of the Debugger built-in rules. 
The table is sorted by the rules that triggered the most frequently. During your training job, the StepOutlier rule
was the most frequently triggered. It processed 1155 datapoints and was triggered 9 times.

Unnamed: 0,Description,Recommendation,Number of times rule triggered,Number of datapoints,Rule parameters
StepOutlier,"Detects outliers in step duration. The step duration for forward and backward pass should be roughly the same throughout the training. If there are significant outliers, it may indicate a system stall or bottleneck issues.","Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers.",9,1155,threshold:3 mode:None n_outliers:10 stddev:3
IOBottleneck,Checks if the data I/O wait time is high and the GPU utilization is low. It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. The rule evaluates the I/O and GPU utilization rates and triggers the issue if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.,"Pre-fetch data or choose different file formats, such as binary formats that improve I/O performance.",0,1640,threshold:50 io_threshold:50 gpu_threshold:10 patience:1000
MaxInitializationTime,Checks if the time spent on initialization exceeds a threshold percent of the total training time. The rule waits until the first step of training loop starts. The initialization can take longer if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes.,"Initialization takes too long. If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework.",0,1155,threshold:20
BatchSize,"Checks if GPUs are underutilized because the batch size is too small. To detect this problem, the rule analyzes the average GPU memory footprint, the CPU and the GPU utilization.","The batch size is too small, and GPUs are underutilized. Consider running on a smaller instance type or increasing the batch size.",0,1633,cpu_threshold_p95:70 gpu_threshold_p95:70 gpu_memory_threshold_p95:70 patience:1000 window:500
LoadBalancing,"Detects workload balancing issues across GPUs. Workload imbalance can occur in training jobs with data parallelism. The gradients are accumulated on a primary GPU, and this GPU might be overused with regard to other GPUs, resulting in reducing the efficiency of data parallelization.",Choose a different distributed training strategy or a different distributed training framework.,0,0,threshold:0.2 patience:1000
CPUBottleneck,"Checks if the CPU utilization is high and the GPU utilization is low. It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.",Consider increasing the number of data loaders or applying data pre-fetching.,0,1640,threshold:50 cpu_threshold:90 gpu_threshold:10 patience:1000
GPUMemoryIncrease,Measures the average GPU memory footprint and triggers if there is a large increase.,Choose a larger instance type with more memory if footprint is close to maximum available memory.,0,0,increase:5 patience:1000 window:10
Dataloader,"Checks how many data loaders are running in parallel and whether the total number is equal the number of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU.",Change the number of data loader processes.,0,1,min_threshold:70 max_threshold:200
LowGPUUtilization,"Checks if the GPU utilization is low or fluctuating. This can happen due to bottlenecks, blocking calls for synchronizations, or a small batch size.","Check if there are bottlenecks, minimize blocking calls, change distributed training strategy, or increase the batch size.",0,0,threshold_p95:70 threshold_p5:10 window:500 patience:1000


In [20]:
# Decide which phase the remaining cells analyse. The default is the training
# loop; initialization is chosen only when it lasted longer than the loop.
# `job_statistics` is expected to be provided by an earlier cell.
analyse_phase = "training"
if job_statistics and "initialization_in_seconds" in job_statistics:
    if job_statistics["initialization_in_seconds"] > job_statistics["training_loop_duration_in_seconds"]:
        analyse_phase = "initialization"
        init_seconds = job_statistics["initialization_in_seconds"]
        init_percent = job_statistics["initialization_%"]
        # The explanation must name the phase that was actually selected.
        display(Markdown(f"""The initialization phase took {int(init_seconds)} seconds, which is {int(init_percent)}%*
        of the total training time. Since the initialization phase has taken the most time, 
        we dive deep into the events occurring during this phase"""))
        display(Markdown("""## Analyzing initialization\n\n"""))
    else:
        # Only claim that the training loop dominated when it really did;
        # printing this after the initialization message contradicted it.
        loop_seconds = job_statistics["training_loop_duration_in_seconds"]
        loop_percent = job_statistics["training_loop_%"]
        display(Markdown(f"""The training loop lasted for {int(loop_seconds)} seconds which is {int(loop_percent)}% of the training job time.
                    Since the training loop has taken the most time, we dive deep into the events occured during this phase."""))
if analyse_phase == 'training':
    display(Markdown("""## Analyzing the training loop\n\n"""))

## Analyzing the training loop



In [21]:
if analyse_phase == "initialization":
    # Heading and explanation of the MaxInitializationTime rule, assembled
    # from adjacent literals (the text ends with a newline and four spaces).
    init_rule_intro = (
        "### MaxInitializationTime\n\n"
        "This rule helps to detect if the training initialization is taking too much time. \n"
        "The rule waits until first step is available. The rule takes the parameter `threshold` that defines how many minutes to wait for the first step to become available. Default is 20 minutes.\n"
        "You can run the rule locally in the following way:\n"
        "    "
    )
    display(Markdown(init_rule_intro))

    # The returned report is not used in this cell.
    _ = load_report("MaxInitializationTime")

In [22]:
# Step duration analysis (StepOutlier rule): explains the rule parameters,
# prints per-node step statistics, draws a histogram of step durations and,
# if the rule triggered, pie charts of what was running during the outliers.
if analyse_phase == "training":
    display(Markdown("""### Step duration analysis"""))
    report = load_report('StepOutlier')
    if report:
        # RuleParameters is split on newlines into "name:value" entries that
        # are read by position. NOTE(review): this assumes the order
        # threshold, mode, n_outliers, stddev - confirm against the rule.
        params = report['RuleParameters'].split('\n')
        stddev = params[3].split(':')[1]
        mode = params[1].split(':')[1]
        n_outlier = params[2].split(':')[1]
        triggered = report['RuleTriggered']
        datapoints = report['Datapoints']
        details = report['Details']

        # Every pie chart in this cell uses the same geometry.
        pie_kwargs = dict(height=350, width=600, x1=0.2, x2=0.6, radius=0.3)

        text = f"""The StepOutlier rule measures step durations and checks for outliers. The rule 
        returns True if duration is larger than {stddev} times the standard deviation. The rule 
        also takes the parameter mode, that specifies whether steps from training or validation phase 
        should be checked. In your processing job mode was specified as {mode}. 
        Typically the first step is taking significantly more time and to avoid the 
        rule triggering immediately, one can use n_outliers to specify the number of outliers to ignore. 
        n_outliers was set to {n_outlier}.
        The rule analysed {datapoints} datapoints and triggered {triggered} times.
        """

        paragraph = Paragraph(text=text, width=900)
        show(column(paragraph))

        step_details = details['step_details']
        if len(step_details) > 0:
            # The threshold is a rule parameter, identical for all nodes, so
            # it is parsed once instead of once per node.
            threshold = report['RuleParameters'].split('threshold:')[1].split('\n')[0]

            for node_id, node_details in step_details.items():
                n_outliers = node_details['number_of_outliers']
                mean = node_details['step_stats']['mean']
                # Kept separate from the `stddev` rule parameter read above so
                # the measured value does not overwrite the configured one.
                node_stddev = node_details['stddev']
                phase = node_details['phase']
                display(Markdown(f"""**Step durations on node {node_id}:**"""))
                display(Markdown(f"""The following table is a summary of the statistics of step durations measured on node {node_id}.
                The rule has analyzed the step duration from {phase} phase.
                The average step duration on node {node_id} was {round(mean, 2)}s. 
                The rule detected {n_outliers} outliers, where step duration was larger than {threshold} times the standard deviation of {node_stddev}s
                                 \n"""))
                # One-row table: the statistic names become the columns.
                step_stats_df = pd.DataFrame.from_dict(node_details['step_stats'], orient='index').T
                step_stats_df.index = ['Step Durations in [s]']
                pretty_print(step_stats_df)

            display(Markdown(f"""The following histogram shows the step durations measured on the different nodes. 
                You can turn on or turn off the visualization of histograms by selecting or unselecting the labels in the legend."""))

            plot = figure(plot_height=450, 
                              plot_width=850, 
                              title=f"""Step durations""")  

            # One colour per node.
            colors = bokeh.palettes.viridis(len(step_details))

            for index, (node_id, node_details) in enumerate(step_details.items()):
                probs = node_details['probs']
                binedges = node_details['binedges']

                # Consecutive bin edges form the left/right side of each bar.
                plot.quad( top=probs,
                        bottom=0,
                        left=binedges[:-1],
                        right=binedges[1:],
                        line_color="white",
                        fill_color=colors[index],
                        fill_alpha=0.7,
                        legend=node_id)

            plot.add_layout(Legend(), 'right')    
            plot.y_range.start = 0
            plot.xaxis.axis_label = f"""Step durations in [s]"""
            plot.yaxis.axis_label = "Occurrences"
            plot.grid.grid_line_color = "white"
            plot.legend.click_policy="hide"
            plot.legend.location = 'center_right'
            show(plot)

        if triggered > 0:

            text=f"""To get a better understanding of what may have caused those outliers,
            we correlate the timestamps of step outliers with other framework metrics that happened at the same time.
            The left chart shows how much time was spent in the different framework
            metrics aggregated by event phase. The chart on the right shows the histogram of normal step durations (without
            outliers). The following chart shows how much time was spent in the different 
            framework metrics when step outliers occurred. In this chart framework metrics are not aggregated byphase."""
            plots = []
            if 'phase' in details:
                text = f"""{text} The chart (in the middle) shows whether step outliers mainly happened during TRAIN or EVAL phase.
                """

                plot = create_piechart(details['phase'], 
                                    title="The ratio between the time spent on the TRAIN/EVAL phase",
                                    **pie_kwargs)
                plots.append(plot)

            if 'forward_backward' in details and len(details['forward_backward']) > 0:

                # Largest entry; quoted in the text next to the chart.
                event = max(details['forward_backward'], key=details['forward_backward'].get)
                perc = details['forward_backward'][event]

                text = f"""{text} The pie chart on the right shows a detailed breakdown. 
                It shows that {int(perc)}% of the training time was spent on event "{event}"."""

                plot = create_piechart(details['forward_backward'], 
                                    title="The Ratio between forward and backward pass",
                                    **pie_kwargs) 
                plots.append(plot)

            if len(plots) > 0:
                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plots)))

            plots = []
            text = ""
            if 'ratio' in details and len(details['ratio']) > 0:

                # The text quotes the first entry of the mapping.
                key = next(iter(details['ratio']))
                ratio = details['ratio'][key]

                text = f"""The following pie chart shows a breakdown of the CPU/GPU operators executed during the step outliers. 
                    It shows that {int(ratio)}% of the training time was spent on executing operators in "{key}"."""

                plot = create_piechart(details['ratio'], 
                                        title="The ratio between CPU/GPU operators",
                                        **pie_kwargs)
                plots.append(plot)


            if 'general' in details and len(details['general']) > 0:

                # Chart only: no sentence is added to `text` for this one.
                plot = create_piechart(details['general'], 
                                    title="General metrics recorded in framework ",
                                    **pie_kwargs)
                plots.append(plot)

            if len(plots) > 0:
                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plots)))

            plots = []
            text = ""
            if 'horovod' in details and len(details['horovod']) > 0:

                event = max(details['horovod'], key=details['horovod'].get)
                perc = details['horovod'][event]
                text = f"""The following pie chart shows a detailed breakdown of the Horovod metrics that have been
                recorded when step outliers happened. The most expensive function was {event} with {int(perc)}%"""

                # Title names the Horovod metrics; it previously repeated the
                # title of the "general metrics" chart.
                plot = create_piechart(details['horovod'], 
                                    title="Horovod metrics",
                                    **pie_kwargs)

                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plot)))      

### Step duration analysis

**Step durations on node algo-1-26:**

The following table is a summary of the statistics of step durations measured on node algo-1-26.
                The rule has analyzed the step duration from Step:ModeKeys.EVAL phase.
                The average step duration on node algo-1-26 was 0.87s. 
                The rule detected 0 outliers, where step duration was larger than 3 times the standard deviation of 0.17s
                                 


Unnamed: 0,mean,max,p99,p95,p50,min
Step Durations in [s],0.87,1.24,1.09,1.02,0.89,0.08


The following histogram shows the step durations measured on the different nodes. 
                You can turn on or turn off the visualization of histograms by selecting or unselecting the labels in the legend.

In [23]:
# GPU utilization analysis (LowGPUUtilization rule): explains the rule and,
# when the report carries details, draws one box plot per node with one box
# per GPU plus a short textual verdict for every GPU.
if analyse_phase == "training":
    display(Markdown("""### GPU utilization analysis\n\n"""))
    display(Markdown("""**Usage per GPU** \n\n"""))
    report = load_report('LowGPUUtilization')
    if report:
        # The first four "name:value" entries of RuleParameters, by position.
        params = report['RuleParameters'].split('\n')
        rule_values = [params[pos].split(':')[1] for pos in range(4)]
        threshold_p95, threshold_p5, window, patience = rule_values
        violations = report['Violations']
        triggered = report['RuleTriggered']
        datapoints = report['Datapoints']
        
        intro = Paragraph(text=f"""The LowGPUUtilization rule checks for a low and fluctuating GPU usage. If the GPU usage is 
        consistently low, it might be caused by bottlenecks or a small batch size. If usage is heavily 
        fluctuating, it can be due to bottlenecks or blocking calls. The rule computed the 95th and 5th 
        percentile of GPU utilization on {window} continuous datapoints and found {violations} cases where 
        p95 was above {threshold_p95}% and p5 was below {threshold_p5}%. If p95 is high and p5 is low,
        it might indicate that the GPU usage is highly fluctuating. If both values are very low, 
        it would mean that the machine is underutilized. During initialization, the GPU usage is likely zero, 
        so the rule skipped the first {patience} data points.
        The rule analysed {datapoints} datapoints and triggered {triggered} times.""", width=800)
        show(intro)

        
        if len(report['Details']) > 0:
            
            # Turn the timestamp of the last trigger into a date and a time.
            timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])
            date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')
            day = date.strftime("%m/%d/%Y")
            hour = date.strftime("%H:%M:%S")
            notice = Paragraph(text=f"""Your training job is underutilizing the instance. You may want to consider
            to either switch to a smaller instance type or to increase the batch size. 
            The last time that the LowGPUUtilization rule was triggered in your training job was on {day} at {hour}.
            The following boxplots are a snapshot from the timestamps. 
            They show the utilization per GPU (without outliers).
            To get a better understanding of the workloads throughout the whole training,
            you can check the workload histogram in the next section.""", width=800)
            show(notice)
            
            # Drop the timestamp so that only per-node entries are iterated.
            report['Details'].pop('last_timestamp')
            
            for node_id in report['Details']:
                node_gpus = report['Details'][node_id]
                
                plot = figure(plot_height=350, 
                          plot_width=1000,
                          toolbar_location='right',
                          tools="hover,wheel_zoom,reset,pan", 
                          title=f"Node {node_id}",
                          x_range=(0,17),
                          )
                
                for index, key in enumerate(node_gpus):
                    gpu_stats = node_gpus[key]
                    # Boxes sit at x = 1, 2, 3, ... in iteration order.
                    slot = index + 1

                    display(Markdown(f"""**GPU utilization of {key} on node {node_id}:**"""))
                    gpu_max = gpu_stats['gpu_max']
                    p_95 = gpu_stats['gpu_95']
                    p_5 = gpu_stats['gpu_5']

                    # Build the verdict sentence piece by piece.
                    text = f""" The max utilization of {key} on node {node_id} was {gpu_max}%"""
                    if p_95 < int(threshold_p95): 
                        text = f"""{text} and the 95th percentile was only {p_95}%. 
                        {key} on node {node_id} is underutilized"""
                    if p_5 < int(threshold_p5): 
                        text = f"""{text} and the 5th percentile was only {p_5}%"""
                    if p_95 - p_5 > 50:
                        text = f"""{text} The difference between 5th percentile {p_5}% and 95th percentile {p_95}% is quite 
                        significant, which means that utilization on {key} is fluctuating quite a lot.\n"""
     
                    upper = gpu_stats['upper']
                    lower = gpu_stats['lower']
                    p75 = gpu_stats['p75']
                    p25 = gpu_stats['p25']
                    p50 = gpu_stats['p50']

                    # Whiskers from the quartiles out to upper/lower.
                    plot.segment(slot, upper, slot, p75, line_color="black")
                    plot.segment(slot, lower, slot, p25, line_color="black")

                    # Two stacked bars meeting at the median form the box.
                    plot.vbar(slot, 0.7, p50, p75, fill_color="#FDE725", line_color="black")
                    plot.vbar(slot, 0.7, p25, p50, fill_color="#440154", line_color="black")

                    # Thin caps at the whisker ends.
                    plot.rect(slot, lower, 0.2, 0.01, line_color="black")
                    plot.rect(slot, upper, 0.2, 0.01, line_color="black")

                    # Label the tick with the GPU key instead of its number.
                    plot.xaxis.major_label_overrides[slot] = key
                    plot.xgrid.grid_line_color = None
                    plot.ygrid.grid_line_color = "white"
                    plot.grid.grid_line_width = 0

                    plot.xaxis.major_label_text_font_size="10px"
                    text = Paragraph(text=f"""{text}""", width=900)
                    show(text)
                plot.yaxis.axis_label = "Utilization in %"
                # Ticks 0 .. number of GPUs on this node.
                plot.xaxis.ticker = np.arange(index+2)
                
                show(plot)

### GPU utilization analysis



**Usage per GPU** 



In [24]:
 
# Workload balancing (LoadBalancing rule): explains the rule, draws one
# utilization histogram per node with one series per GPU, and lists the
# pairwise workload differences when the report contains them.
if analyse_phase == "training": 
    display(Markdown("""**Workload balancing**\n\n""")) 
    report = load_report('LoadBalancing')
    if report:
        # First two "name:value" entries of RuleParameters, by position.
        params = report['RuleParameters'].split('\n')
        threshold = params[0].split(':')[1]
        patience = params[1].split(':')[1]
        triggered = report['RuleTriggered']
        datapoints = report['Datapoints']
    
        intro_text = f"""The LoadBalancing rule helps to detect issues in workload balancing 
        between multiple GPUs. 
        It computes a histogram of GPU utilization values for each GPU and compares then the 
        similarity between histograms. The rule checked if the distance of histograms is larger than the 
        threshold of {threshold}.
        During initialization utilization is likely zero, so the rule skipped the first {patience} data points.
        """
        paragraph = Paragraph(text=intro_text, width=900)
        show(paragraph)
        
        node_reports = report['Details']
        if len(node_reports) > 0:
            for node_id in node_reports: 
                node_report = node_reports[node_id]
                
                text = f"""The following histogram shows the workload per GPU on node {node_id}. 
                You can enable/disable the visualization of a workload by clicking on the label in the legend.
                """
                # A single node with a single GPU has nothing to balance.
                single_gpu_job = len(node_reports) == 1 and len(node_report['workloads']) == 1
                if single_gpu_job:
                    text = f"""{text} Your training job only used one GPU so there is no workload balancing issue."""
                
                plot = figure(plot_height=450, 
                              plot_width=850, 
                              x_range=(-1,100),
                              title=f"""Workloads on node {node_id}""")
                
                workloads = node_report['workloads']
                # One colour per GPU series.
                colors = bokeh.palettes.viridis(len(workloads))
                
                for index, gpu_id2 in enumerate(workloads):
                    probs = workloads[gpu_id2]
                    # Bars are two units wide: [0, 2), [2, 4), ... up to 98.
                    bar_left = np.arange(0,98,2)
                    bar_right = np.arange(2,100,2)
                    plot.quad( top=probs,
                                bottom=0,
                                left=bar_left,
                                right=bar_right,
                                line_color="white",
                                fill_color=colors[index],
                                fill_alpha=0.8,
                                legend=gpu_id2 )

                    plot.y_range.start = 0
                    plot.xaxis.axis_label = f"""Utilization"""
                    plot.yaxis.axis_label = "Occurrences"
                    plot.grid.grid_line_color = "white"
                    plot.legend.click_policy="hide"
                
                paragraph = Paragraph(text=text)
                show(column(paragraph, plot))
                
                if "distances" in node_report:
                    text = f"""The rule identified workload balancing issues on node {node_id} 
                    where workloads differed by more than threshold {threshold}. 
                    """
                    # Nested mapping: GPU -> other GPU -> distance.
                    for gpu_id2, peer_distances in node_report['distances'].items():
                        for gpu_id1, raw_distance in peer_distances.items():
                            distance = round(raw_distance, 2)
                            text = f"""{text} The difference of workload between {gpu_id2} and {gpu_id1} is: {distance}."""

                    paragraph = Paragraph(text=f"""{text}""", width=900)
                    show(column(paragraph))

**Workload balancing**



In [25]:
if analyse_phase == "training":
    # Render the Dataloader rule section: summary text, recommendations and a
    # histogram of the measured dataloading times.
    display(Markdown("""### Dataloading analysis\n\n"""))
    # Nothing beyond the heading is rendered when load_report returns a falsy value.
    report = load_report('Dataloader')
    if report:
        # RuleParameters is split on newlines and each entry on ':'; the values
        # of the first two entries are used as the lower/upper thresholds below.
        params = report['RuleParameters'].split("\n")
        min_threshold = params[0].split(':')[1]
        max_threshold = params[1].split(':')[1]
        triggered = report['RuleTriggered']
        datapoints = report['Datapoints']
        details = report['Details']

        text=f"""The number of dataloader workers can greatly affect the overall performance 
        of your training job. The rule analyzed the number of dataloading processes that have been running in 
        parallel on the training instance and compares it against the total number of cores. 
        The rule checked if the number of processes is smaller than {min_threshold}% or larger than 
        {max_threshold}% the total number of cores. Having too few dataloader workers can slowdown data preprocessing and lead to GPU 
        underutilization. Having too many dataloader workers may hurt the
        overall performance if you are running other compute intensive tasks on the CPU.
        The rule analysed {datapoints} datapoints and triggered {triggered} times."""

        paragraph = Paragraph(text=text, width=900)
        show(paragraph)

        # Accumulate the recommendations that apply; each one is appended to
        # `text` so they end up in a single paragraph.
        text = ""
        if 'cores' in details:
            cores = int(details['cores'])
            dataloaders = details['dataloaders']
            if dataloaders < cores:
                text=f"""{text} Your training instance provided {cores} CPU cores, however your training job only 
                ran on average {dataloaders} dataloader workers in parallel. We recommend you to increase the number of
                dataloader workers."""
            elif dataloaders > cores:
                text=f"""{text} Your training instance provided {cores} CPU cores, however your training job ran 
                on average {dataloaders} dataloader workers. We recommend you to decrease the number of dataloader
                workers."""
        # `== False` (rather than `is False`) is kept on purpose so that a
        # stored 0 is treated the same way as a stored False.
        if 'pin_memory' in details and details['pin_memory'] == False:
            text=f"""{text} Using pinned memory also improves performance because it enables fast data transfer to CUDA-enabled GPUs.
            The rule detected that your training job was not using pinned memory. 
            In case of using PyTorch Dataloader, you can enable this by setting pin_memory=True."""

        if 'prefetch' in details and details['prefetch'] == False:
            text=f"""{text} It appears that your training job did not perform any data pre-fetching. Pre-fetching can improve your
            data input pipeline as it produces the data ahead of time."""

        # Only render the recommendation paragraph when there is something to say.
        if text:
            paragraph = Paragraph(text=text, width=900)
            show(paragraph)

        colors=bokeh.palettes.viridis(10)
        if "dataloading_time" in details:
            timing = details["dataloading_time"]
            median = round(timing['p50'],4)
            p95 = round(timing['p95'],4)
            p25 = round(timing['p25'],4)
            # One bar per consecutive pair of bin edges, with `probs` as bar heights.
            binedges = timing['binedges']
            probs = timing['probs']
            text=f"""The following histogram shows the distribution of dataloading times that have been measured throughout your training job. The median dataloading time was {median}s. 
            The 95th percentile was {p95}s and the 25th percentile was {p25}s"""

            plot = figure(plot_height=450, 
                              plot_width=850,
                              toolbar_location='right',
                              tools="hover,wheel_zoom,reset,pan",
                              x_range=(binedges[0], binedges[-1])
                              )

            plot.quad( top=probs,
                        bottom=0,
                        left=binedges[:-1],
                        right=binedges[1:],
                        line_color="white",
                        fill_color=colors[0],
                        fill_alpha=0.8,
                        legend="Dataloading events" )

            plot.y_range.start = 0
            plot.xaxis.axis_label = "Dataloading in [s]"
            plot.yaxis.axis_label = "Occurrences"
            plot.grid.grid_line_color = "white"
            plot.legend.click_policy="hide"

            paragraph = Paragraph(text=text, width=900)
            show(column(paragraph, plot))

### Dataloading analysis



In [26]:
if analyse_phase == "training":
    # Render the BatchSize rule section: summary text and, when the rule
    # reported details, one boxplot figure per node.
    display(Markdown(""" ### Batch size"""))
    report = load_report('BatchSize')
    if report:
        # RuleParameters is split on newlines and each entry on ':'; the first
        # five values are read positionally as integers.
        params = report['RuleParameters'].split('\n')
        cpu_threshold_p95 = int(params[0].split(':')[1])
        gpu_threshold_p95 = int(params[1].split(':')[1])
        gpu_memory_threshold_p95 = int(params[2].split(':')[1])
        patience = int(params[3].split(':')[1])
        window = int(params[4].split(':')[1])
        violations = report['Violations']
        triggered = report['RuleTriggered']
        datapoints = report['Datapoints']

        text = Paragraph(text=f"""The BatchSize rule helps to detect if GPU is underutilized because of the batch size being 
        too small. To detect this the rule analyzes the GPU memory footprint, CPU and GPU utilization. The rule checked if the 95th percentile of CPU utilization is below cpu_threshold_p95 of 
        {cpu_threshold_p95}%, the 95th percentile of GPU utilization is below gpu_threshold_p95 of {gpu_threshold_p95}% and the 95th percentile of memory footprint \
        below gpu_memory_threshold_p95 of {gpu_memory_threshold_p95}%. In your training job this happened {violations} times. \
        The rule skipped the first {patience} datapoints. The rule computed the percentiles over window size of {window} continuous datapoints.\n
        The rule analysed {datapoints} datapoints and triggered {triggered} times.
        """, width=800)
        show(text)
        if len(report['Details']) >0: 
            timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])
            date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')
            day = date.date().strftime("%m/%d/%Y")
            hour = date.time().strftime("%H:%M:%S")
            # Drop the timestamp so that the remaining keys of Details are
            # exactly the node ids iterated below.
            del report['Details']['last_timestamp']
            text = Paragraph(text=f"""Your training job is underutilizing the instance. You may want to consider
            either switch to a smaller instance type or to increase the batch size. 
            The last time the BatchSize rule triggered in your training job was on {day} at {hour}.
            The following boxplots are a snapshot from the timestamps. They show the total 
            CPU utilization, the GPU utilization, and the GPU memory usage per GPU (without outliers).""", 
            width=800)
            show(text)

            for node_id in report['Details']:
                node_stats = report['Details'][node_id]
                n_boxes = len(node_stats)
                # Boxes are drawn at x = 1..n_boxes; keep one unit of room on
                # the right so the last box is not cut off by the axis range.
                xmax = max(20, n_boxes + 1)

                plot = figure(plot_height=350, 
                          plot_width=1000,
                          toolbar_location='right',
                          tools="hover,wheel_zoom,reset,pan", 
                          title=f"Node {node_id}",
                          x_range=(0,xmax)
                          )

                for index, key in enumerate(node_stats):
                    stats = node_stats[key]
                    position = index + 1
                    upper = stats['upper']
                    lower = stats['lower']
                    p75 = stats['p75']
                    p25 = stats['p25']
                    p50 = stats['p50']

                    # Whiskers from the box to the upper/lower values.
                    plot.segment(position, upper, position, p75, line_color="black")
                    plot.segment(position, lower, position, p25, line_color="black")

                    # Box split at the median: p50-p75 and p25-p50.
                    plot.vbar(position, 0.7, p50, p75, fill_color="#FDE725", line_color="black")
                    plot.vbar(position, 0.7, p25, p50, fill_color="#440154", line_color="black")

                    # Small caps at the whisker ends.
                    plot.rect(position, lower, 0.2, 0.01, line_color="black")
                    plot.rect(position, upper, 0.2, 0.01, line_color="black")

                    # Show the metric name instead of the numeric tick.
                    plot.xaxis.major_label_overrides[position] = key

                # Styling does not depend on the individual boxes, so apply it once.
                plot.xgrid.grid_line_color = None
                plot.ygrid.grid_line_color = "white"
                plot.grid.grid_line_width = 0
                plot.xaxis.major_label_text_font_size="10px"
                # Ticks 0..n_boxes, derived from the entry count rather than
                # the loop variable so an empty node cannot leave it unbound.
                plot.xaxis.ticker = np.arange(n_boxes + 1)
                plot.yaxis.axis_label = "Utilization in %"
                show(plot)

 ### Batch size

In [27]:
if analyse_phase == "training": 
    # Render the CPUBottleneck rule section: summary text and, when the rule
    # triggered, pie charts built from the entries found in report['Details'].
    display(Markdown("""### CPU bottlenecks\n\n"""))

    report = load_report('CPUBottleneck')
    if report:
        # RuleParameters is split on newlines and each entry on ':'; the first
        # four values are read positionally as integers.
        params = report['RuleParameters'].split('\n')
        threshold = int(params[0].split(':')[1])
        cpu_threshold = int(params[1].split(':')[1])
        gpu_threshold = int(params[2].split(':')[1])
        patience = int(params[3].split(':')[1])
        violations = report['Violations']
        triggered = report['RuleTriggered']
        datapoints = report['Datapoints']

        # Share of datapoints flagged as violations, truncated to whole percent.
        perc = int(violations / datapoints * 100) if violations > 0 else 0
        string = 'below' if perc < threshold else 'above'
        text = f"""The CPUBottleneck rule checked when the CPU utilization was above cpu_threshold of {cpu_threshold}% 
        and GPU utilization was below gpu_threshold of {gpu_threshold}%. 
        During initialization utilization is likely to be zero, so the rule skipped the first {patience} datapoints.
        With this configuration the rule found {violations} CPU bottlenecks which is {perc}% of the total time. This is {string} the threshold of {threshold}%.
        The rule analysed {datapoints} data points and triggered {triggered} times."""

        paragraph = Paragraph(text=text, width=900)
        show(paragraph)

        if triggered > 0:
            details = report['Details']
            # Layout shared by every pie chart in this section.
            pie_kwargs = dict(height=350, width=600, x1=0.2, x2=0.6, radius=0.3)

            # --- First row: GPU usage split, phase split, forward/backward split ---
            plots = []
            low_gpu = details['low_gpu_utilization']
            bottleneck_count = len(details['bottlenecks'])
            # Subtract the bottleneck datapoints (not the number of keys in
            # Details) so the three slices add up to the analysed datapoints.
            cpu_bottleneck = {
                "GPU usage above threshold": datapoints - low_gpu,
                "GPU usage below threshold": low_gpu - bottleneck_count,
                "Low GPU usage due to CPU bottlenecks": bottleneck_count,
            }

            n_bottlenecks = round(bottleneck_count/datapoints * 100, 2)
            text = f"""The following chart (left) shows how many datapoints were below the gpu_threshold of {gpu_threshold}%
            and how many of those datapoints were likely caused by a CPU bottleneck. The rule found {low_gpu} out of {datapoints} datapoints which had a GPU utilization 
            below {gpu_threshold}%. Out of those datapoints {n_bottlenecks}% were likely caused by CPU bottlenecks. 
            """

            plots.append(create_piechart(cpu_bottleneck,
                                         title="Low GPU usage caused by CPU bottlenecks",
                                         **pie_kwargs))

            if 'phase' in details:
                text = f"""{text} The chart (in the middle) shows whether CPU bottlenecks mainly 
                happened during train/validation phase.
                """

                plots.append(create_piechart(details['phase'],
                                             title="The ratio between time spent on TRAIN/EVAL phase",
                                             **pie_kwargs))

            if 'forward_backward' in details and len(details['forward_backward']) > 0:
                forward_backward = details['forward_backward']
                # Entry with the largest value.
                event = max(forward_backward, key=forward_backward.get)
                perc = forward_backward[event]

                text = f"""{text} The pie chart on the right shows a more detailed breakdown. 
                It shows that {int(perc)}% of the training time was spent on event {event}"""

                plots.append(create_piechart(forward_backward,
                                             title="The ratio between forward and backward pass",
                                             **pie_kwargs))

            # The first chart is always present, so this row is never empty.
            paragraph = Paragraph(text=text, width=900)
            show(column(paragraph, row(plots)))

            # --- Second row: operator ratio and general framework metrics ---
            plots = []
            text = ""
            if 'ratio' in details and len(details['ratio']) > 0:
                # The text reports the first entry of the mapping.
                key = list(details['ratio'].keys())[0]
                ratio = details['ratio'][key]

                text = f"""The following pie chart shows a breakdown of the CPU/GPU operators that happened during CPU bottlenecks. 
                    It shows that {int(ratio)}% of the training time was spent on executing operators in "{key}"."""

                plots.append(create_piechart(details['ratio'],
                                             title="The ratio between CPU/GPU operators",
                                             **pie_kwargs))

            if 'general' in details and len(details['general']) > 0:
                plots.append(create_piechart(details['general'],
                                             title="General metrics recorded in framework ",
                                             **pie_kwargs))

            if len(plots) > 0:
                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plots)))

            # --- Third row: Horovod metrics ---
            if 'horovod' in details and len(details['horovod']) > 0:
                horovod = details['horovod']
                event = max(horovod, key=horovod.get)
                perc = horovod[event]
                text = f"""The following pie chart shows a detailed breakdown of the Horovod metrics 
                that have been recorded when the CPU bottleneck happened. The most expensive function was 
                {event} with {int(perc)}%"""

                plot = create_piechart(horovod,
                                       title="Horovod metrics recorded in framework",
                                       **pie_kwargs)

                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plot)))

### CPU bottlenecks



In [28]:
if analyse_phase == "training": 
    # Render the IOBottleneck rule section: summary text and, when the rule
    # triggered, pie charts built from the entries found in report['Details'].
    display(Markdown("""### I/O bottlenecks\n\n"""))

    report = load_report('IOBottleneck')
    if report:
        # RuleParameters is split on newlines and each entry on ':'; the first
        # four values are read positionally as integers.
        params = report['RuleParameters'].split('\n')
        threshold = int(params[0].split(':')[1])
        io_threshold = int(params[1].split(':')[1])
        gpu_threshold = int(params[2].split(':')[1])
        patience = int(params[3].split(':')[1])
        violations = report['Violations']
        triggered = report['RuleTriggered']
        datapoints = report['Datapoints']

        # Share of datapoints flagged as violations, truncated to whole percent.
        perc = int(violations / datapoints * 100) if violations > 0 else 0
        string = 'below' if perc < threshold else 'above'
        text = f"""The IOBottleneck rule checked when I/O wait time was above io_threshold of {io_threshold}% 
        and GPU utilization was below gpu_threshold of {gpu_threshold}%. During initialization utilization is likely to be zero, so the rule skipped the first {patience} datapoints. 
        With this configuration the rule found {violations} I/O bottlenecks which is {perc}% of the total time. This is {string} the threshold of {threshold}%.
        The rule analysed {datapoints} datapoints and triggered {triggered} times."""
        paragraph = Paragraph(text=text, width=900)
        show(paragraph)

        if triggered > 0:
            details = report['Details']
            # Layout shared by every pie chart in this section.
            pie_kwargs = dict(height=350, width=600, x1=0.2, x2=0.6, radius=0.3)

            # --- First row: GPU usage split, phase split, forward/backward split ---
            plots = []
            low_gpu = details['low_gpu_utilization']
            bottleneck_count = len(details['bottlenecks'])
            # Subtract the bottleneck datapoints (not the number of keys in
            # Details) so the three slices add up to the analysed datapoints.
            io_bottleneck = {
                "GPU usage above threshold": datapoints - low_gpu,
                "GPU usage below threshold": low_gpu - bottleneck_count,
                "Low GPU usage due to I/O bottlenecks": bottleneck_count,
            }

            n_bottlenecks = round(bottleneck_count/datapoints * 100, 2)
            text = f"""The following chart (left) shows how many datapoints were below the gpu_threshold of {gpu_threshold}%
            and how many of those datapoints were likely caused by an I/O bottleneck. The rule found {low_gpu} out of {datapoints} datapoints which had a GPU utilization 
            below {gpu_threshold}%. Out of those datapoints {n_bottlenecks}% were likely caused by I/O bottlenecks. 
            """

            plots.append(create_piechart(io_bottleneck,
                                         title="Low GPU usage caused by I/O bottlenecks",
                                         **pie_kwargs))

            if 'phase' in details:
                text = f"""{text} The chart (in the middle) shows whether I/O bottlenecks mainly happened during the training or validation phase.
                """

                plots.append(create_piechart(details['phase'],
                                             title="The ratio between the time spent on the TRAIN/EVAL phase",
                                             **pie_kwargs))

            if 'forward_backward' in details and len(details['forward_backward']) > 0:
                forward_backward = details['forward_backward']
                # Entry with the largest value.
                event = max(forward_backward, key=forward_backward.get)
                perc = forward_backward[event]

                text = f"""{text} The pie chart on the right shows a more detailed breakdown. 
                It shows that {int(perc)}% of the training time was spent on event "{event}"."""

                plots.append(create_piechart(forward_backward,
                                             title="The ratio between forward and backward pass",
                                             **pie_kwargs))

            # The first chart is always present, so this row is never empty.
            paragraph = Paragraph(text=text, width=900)
            show(column(paragraph, row(plots)))

            # --- Second row: operator ratio and general framework metrics ---
            plots = []
            text = ""
            if 'ratio' in details and len(details['ratio']) > 0:
                # The text reports the first entry of the mapping.
                key = list(details['ratio'].keys())[0]
                ratio = details['ratio'][key]

                text = f"""The following pie chart shows a breakdown of the CPU/GPU operators that happened 
                during I/O bottlenecks. It shows that {int(ratio)}% of the training time was spent on executing operators in "{key}"."""

                plots.append(create_piechart(details['ratio'],
                                             title="Ratio between CPU/GPU operators",
                                             **pie_kwargs))

            if 'general' in details and len(details['general']) > 0:
                plots.append(create_piechart(details['general'],
                                             title="General metrics recorded in framework ",
                                             **pie_kwargs))

            if len(plots) > 0:
                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plots)))

            # --- Third row: Horovod metrics ---
            if 'horovod' in details and len(details['horovod']) > 0:
                horovod = details['horovod']
                event = max(horovod, key=horovod.get)
                perc = horovod[event]
                text = f"""The following pie chart shows a detailed breakdown of the Horovod metrics that have been
                recorded when I/O bottleneck happened. The most expensive function was {event} with {int(perc)}%"""

                plot = create_piechart(horovod,
                                       title="Horovod metrics recorded in framework",
                                       **pie_kwargs)

                paragraph = Paragraph(text=text, width=900)
                show(column(paragraph, row(plot)))


### I/O bottlenecks



In [29]:
if analyse_phase == "training":
    # Render the GPUMemoryIncrease rule section: summary text and, when the
    # rule reported details, per-entry remarks plus one boxplot figure per node.
    display(Markdown("""### GPU memory\n\n"""))

    report = load_report('GPUMemoryIncrease')
    if report:
        # RuleParameters is split on newlines and each entry on ':'; only the
        # first value is converted to a number because it is used in arithmetic.
        params = report['RuleParameters'].split('\n')
        increase = float(params[0].split(':')[1])
        patience = params[1].split(':')[1]
        window = params[2].split(':')[1]
        violations = report['Violations']
        triggered = report['RuleTriggered']
        datapoints = report['Datapoints']

        text=Paragraph(text=f"""The GPUMemoryIncrease rule helps to detect large increase in memory usage on GPUs. 
        The rule checked if the moving average of memory increased by more than {increase}%. 
        So if the moving average increased for instance from 10% to {11+increase}%, 
        the rule would have triggered. During initialization utilization  is likely 0, so the rule skipped the first {patience} datapoints.
        The moving average was computed on a window size of {window} continuous datapoints. The rule detected {violations} violations
        where the moving average between previous and current time window increased by more than {increase}%.
        The rule analysed {datapoints} datapoints and triggered {triggered} times.""",
                       width=900)
        show(text)

        if len(report['Details']) > 0:

            timestamp = us_since_epoch_to_human_readable_time(report['Details']['last_timestamp'])
            date = datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S:%f')
            day = date.date().strftime("%m/%d/%Y")
            hour = date.time().strftime("%H:%M:%S")
            text = Paragraph(text=f"""Your training job triggered memory spikes. 
            The last time the GPUMemoryIncrease rule triggered in your training job was on {day} at {hour}.
            The following boxplots are a snapshot from the timestamps. They show for each node and GPU the corresponding
            memory utilization (without outliers).""", width=900)
            show(text)

            # Drop the timestamp so that the remaining keys of Details are
            # exactly the node ids iterated below.
            del report['Details']['last_timestamp']

            for node_id in report['Details']:
                node_stats = report['Details'][node_id]
                n_boxes = len(node_stats)

                # Boxes are drawn at x = 1..n_boxes; widen the range beyond the
                # default of 17 when a node has more entries than fit into it.
                plot = figure(plot_height=350, 
                          plot_width=1000,
                          toolbar_location='right',
                          tools="hover,wheel_zoom,reset,pan", 
                          title=f"Node {node_id}",
                          x_range=(0, max(17, n_boxes + 1)),
                          )

                for index, key in enumerate(node_stats):
                    stats = node_stats[key]
                    position = index + 1

                    display(Markdown(f"""**Memory utilization of {key} on node {node_id}:**"""))
                    gpu_max = stats['gpu_max']
                    text = f""" The max memory utilization of {key} on node {node_id} was {gpu_max}%."""

                    p_95 = int(stats['p95'])
                    p_5 = stats['p05']
                    if p_95 < 50:
                        text = f"""{text} The 95th percentile was only {p_95}%."""
                    if p_5 < 5:
                        text = f"""{text} The 5th percentile was only {p_5}%."""
                    if p_95 - p_5 > 50:
                        text = f"""{text} The difference between 5th percentile {p_5}% and 95th percentile {p_95}% is quite 
                        significant, which means that memory utilization on {key} is fluctuating quite a lot."""

                    paragraph = Paragraph(text=text, width=900)
                    show(paragraph)

                    upper = stats['upper']
                    lower = stats['lower']
                    p75 = stats['p75']
                    p25 = stats['p25']
                    p50 = stats['p50']

                    # Whiskers from the box to the upper/lower values.
                    plot.segment(position, upper, position, p75, line_color="black")
                    plot.segment(position, lower, position, p25, line_color="black")

                    # Box split at the median: p50-p75 and p25-p50.
                    plot.vbar(position, 0.7, p50, p75, fill_color="#FDE725", line_color="black")
                    plot.vbar(position, 0.7, p25, p50, fill_color="#440154", line_color="black")

                    # Small caps at the whisker ends.
                    plot.rect(position, lower, 0.2, 0.01, line_color="black")
                    plot.rect(position, upper, 0.2, 0.01, line_color="black")

                    # Show the entry name instead of the numeric tick.
                    plot.xaxis.major_label_overrides[position] = key

                # Styling does not depend on the individual boxes, so apply it once.
                plot.xgrid.grid_line_color = None
                plot.ygrid.grid_line_color = "white"
                plot.grid.grid_line_width = 0
                plot.xaxis.major_label_text_font_size="10px"
                # Ticks 0..n_boxes, derived from the entry count rather than
                # the loop variable so an empty node cannot leave it unbound.
                plot.xaxis.ticker = np.arange(n_boxes + 1)
                plot.yaxis.axis_label = "Utilization in %"
                show(plot)

### GPU memory

