# Benchmarking YOLOv8 on EVERYTHING using ClearML

In [None]:
from clearml import Task

# Get the base task to clone
template_task = Task.get_task(task_id='c5290af4eeec49a083289dd8e1e5e62d')

In [2]:
queues = [
    # AWS
    "g4dn.xlarge",
    "g4dn.2xlarge",
    "g4dn.4xlarge",
    "g4dn.8xlarge",
    "p3.2xlarge",
    "g5.xlarge",
    "g5.2xlarge",
    # Genesis
    "3060Ti",
    "3080",
    "3090",
    "3080_Optimized",
    "3090_Optimized"
]

In [3]:
ideal_batch_size = {
    # AWS
    "g4dn.xlarge": 16,
    "g4dn.2xlarge": 16,
    "g4dn.4xlarge": 16,
    "g4dn.8xlarge": 16,
    "p3.2xlarge": 32,
    "g5.xlarge": 32,
    "g5.2xlarge": 32,
    # Genesis
    "3060Ti": 4,
    "3080": 8,
    "3090": 16,
    "3080_Optimized": 8,
    "3090_Optimized": 16
}

In [4]:
aws_cost_mapping = {
    "g4dn.xlarge": 0.5870,
    "g4dn.2xlarge": 0.8380,
    "g4dn.4xlarge": 1.3420,
    "g4dn.8xlarge": 2.4260,
    "p2.xlarge": 0.9720,
    "g2.2xlarge": 0.7020,
    "p3.2xlarge": 3.3050
}

genesis_cost_mapping = {
    "3060Ti": 0.65,
    "3080": 0.90,
    "3090": 1.30,
    "3080_Optimized": 1,
    "3090_Optimized": 1.40
}

clearml_cost_mapping = {
    "3060Ti": 0.39,
    "3080": 0.54,
    "3090": 0.78,
    "3080_Optimized": 0.6,
    "3090_Optimized": 0.84
}

In [5]:
import numpy as np
import pickle
import time

import plotly.graph_objects as go


def get_task_runtimes(tasks):
    runtimes = {
        task.name.replace("YOLOv8_", ""): task.get_reported_scalars()['Epoch Time']['Epoch Time']['y'][2:-2]
        for task in tasks
        if 'Epoch Time' in task.get_reported_scalars()
    }
    return runtimes

def get_lastest_tasks(queues, project_name="YOLOv8"):
    tasks = []
    for queue_name in queues:
        t = Task.get_task(project_name=project_name, task_name=f"YOLOv8_{queue_name}")
        if t and t.status not in ["queued", "draft"]:
            tasks.append(t)
    return tasks

def clone_and_enqueue(task, queue, epochs=100, batch_size=None, tags=None, dataset=None):
    new_task = Task.clone(source_task=task)
    new_task.update_parameters({"General/epochs": epochs})
    if batch_size:
        new_task.update_parameters({"General/batch": batch_size})
    if tags:
        new_task.add_tags(tags)
    if dataset:
        new_task.update_parameters({"General/data": dataset})
    new_task.rename(f"YOLOv8_{queue}")
    Task.enqueue(task=new_task, queue_name=queue)
    return new_task

def poll_status(tasks):
    flag = False
    while flag == False:
        flag = True
        for task in tasks:
            # Check if succeeded
            print(task.status)
            if task.status == 'failure':
                print(f"[FAILED] Task {task.name} has failed!!")
            elif task.status != 'completed':
                flag = False
        print("===============")
        time.sleep(5)
    print("All done!")

def calc_performance_per_dollar(runtimes):
    efficiency = {}
    for instance_name, timings in runtimes.items():
        avg_timing = np.mean(timings)
        epochs_per_hour = 3600 / avg_timing
        price = aws_cost_mapping.get(instance_name, None)
        if price:
            efficiency[instance_name] = (avg_timing, (1/price)*epochs_per_hour)
        else:
            efficiency[f"genesis_{instance_name}"] = (avg_timing, (1/genesis_cost_mapping[instance_name])*epochs_per_hour)
            efficiency[f"clearml_{instance_name}"] = (avg_timing, (1/clearml_cost_mapping[instance_name])*epochs_per_hour)

    return efficiency

def plot_performance_per_dollar(performance_per_dollar):
    x, y = zip(*sorted(performance_per_dollar.items(), key=lambda x: -x[1][1])) # unpack a list of pairs into two tuples
    rt = [r[0] for r in y]
    ef = [e[1] for e in y]

    # fig = go.Figure(data=[
    #     go.Bar(name='Average Epoch Runtime (In seconds, lower is better)', x=x, y=rt),
    #     go.Bar(name='Cost Efficiency (Epochs/Dollar - higher is better)', x=x, y=ef)
    # ])
    # # Change the bar mode
    # fig.update_layout(title="Model Training Cost Efficiency", barmode='group')
    # fig.show()

    fig = go.Figure(
        data=[
            go.Bar(name='Cost Efficiency (Epochs/Dollar - higher is better)', x=x, y=ef)
        ]
    )
    # Change the bar mode
    fig.update_layout(title="Model Training Cost Efficiency", barmode='group')
    fig.show()

def plot_performance_and_cost(runtimes):
    prices = []
    speeds = []
    instance_names = []
    for instance_name, timings in runtimes.items():
        avg_timing = np.mean(timings)
        epochs_per_hour = 3600 / avg_timing
        price = aws_cost_mapping.get(instance_name, None)
        if price:
            speeds.append(epochs_per_hour)
            instance_names.append(instance_name)
            prices.append(price)
        else:
            instance_names.append(f"genesis_{instance_name}")
            prices.append(genesis_cost_mapping[instance_name])
            speeds.append(epochs_per_hour)
            instance_names.append(f"clearml_{instance_name}")
            prices.append(clearml_cost_mapping[instance_name])
            speeds.append(epochs_per_hour)

    fig = go.Figure(
        data=[
            go.Bar(name='Training Speed (Epochs/H - Higher is Better)', yaxis="y", x=instance_names, y=speeds, offsetgroup=1),
            go.Bar(name='Cost (Dollars/H - Lower is Better)', yaxis="y2", x=instance_names, y=prices, offsetgroup=2)
        ],
        layout={
            'yaxis': {'title': 'Epochs'},
            'yaxis2': {'title': 'Dollars/H', 'overlaying': 'y', 'side': 'right'}
        }
    )
    # Change the bar mode
    fig.update_layout(title="Model Training Performance vs Cost", barmode='group')
    fig.show()

# All same batch size

In [None]:
tasks = []
for queue in queues:
    tasks.append(clone_and_enqueue(template_task, queue))

In [67]:
tasks = get_lastest_tasks(queues, project_name="YOLOv8/Clean Run Fixed Batch Size")
[task.name for task in tasks]

['YOLOv8_g4dn.xlarge',
 'YOLOv8_g4dn.2xlarge',
 'YOLOv8_g4dn.4xlarge',
 'YOLOv8_g4dn.8xlarge',
 'YOLOv8_3060Ti',
 'YOLOv8_3080',
 'YOLOv8_3090',
 'YOLOv8_3080_Optimized',
 'YOLOv8_3090_Optimized']

In [63]:
runtimes = get_task_runtimes(tasks)
performance_per_dollar = calc_performance_per_dollar(runtimes)
performance_per_dollar

{'g4dn.xlarge': (5.384898533423741, 1138.9033624180995),
 'g4dn.2xlarge': (5.147748236854871, 834.5285206466165),
 'g4dn.4xlarge': (4.705686276157697, 570.0684620419318),
 'g4dn.8xlarge': (4.508692234754562, 329.12518258599073),
 'genesis_3060Ti': (5.750945294896762, 963.0523773850229),
 'clearml_3060Ti': (5.750945294896762, 1605.087295641705),
 'genesis_3080': (5.575009639064471, 717.4875487159212),
 'clearml_3080': (5.575009639064471, 1195.8125811932018),
 'genesis_3090': (5.4650935480992, 506.7124185264729),
 'clearml_3090': (5.4650935480992, 844.5206975441215),
 'genesis_3080_Optimized': (5.144766345620155, 699.740232725001),
 'clearml_3080_Optimized': (5.144766345620155, 1166.233721208335),
 'genesis_3090_Optimized': (5.192823871970177, 495.18886733452064),
 'clearml_3090_Optimized': (5.192823871970177, 825.3147788908677)}

In [71]:
plot_performance_per_dollar(performance_per_dollar)

In [73]:
plot_performance_and_cost(runtimes)

In [76]:
new_runtimes = {}
for key in ["g4dn.2xlarge", "3090_Optimized"]:
    new_runtimes[key] = runtimes[key]
plot_performance_and_cost(new_runtimes)


# Getting the right batch size

So far, the batch size was the default of 16, so what if we tune the batch sizes to fit more to the available free GPU memory. I checked this manually by running and looking for OOMs

In [16]:
tasks = []
for batch_size in [4, 8, 16, 32, 64]:
    for queue in queues:
        tasks.append(clone_and_enqueue(template_task, queue, epochs=10, batch_size=batch_size, tags=['BATCH_SIZE_TEST', 'VISDRONE'], dataset="VisDrone.yaml"))

In [None]:
# coco 128, but not much use
# ideal_batch_size = {
#     # AWS
#     "g4dn.xlarge": 32,
#     "g4dn.2xlarge": 32,
#     "g4dn.4xlarge": 32,
#     "g4dn.8xlarge": 32,
#     # "p2.xlarge": 64,
#     # "g2.2xlarge",
#     # "p3.2xlarge",
#     # Genesis
#     "3060Ti": 16,
#     "3080": 32,
#     "3090": 64,
#     "3080_Optimized": 32,
#     "3090_Optimized": 64
# }

# For VisDrone
ideal_batch_size = {
    # AWS
    "g4dn.xlarge": 16,
    "g4dn.2xlarge": 16,
    "g4dn.4xlarge": 16,
    "g4dn.8xlarge": 16,
    # "p2.xlarge": 64,
    # "g2.2xlarge",
    # "p3.2xlarge",
    # Genesis
    "3060Ti": 4,
    "3080": 8,
    "3090": 16,
    "3080_Optimized": 8,
    "3090_Optimized": 16
}

In [None]:
poll_status(tasks)

In [53]:
tasks = get_lastest_tasks(queues, project_name="YOLOv8")
[task.name for task in tasks]



['YOLOv8_g4dn.xlarge',
 'YOLOv8_g4dn.2xlarge',
 'YOLOv8_g4dn.4xlarge',
 'YOLOv8_g4dn.8xlarge',
 'YOLOv8_3060Ti',
 'YOLOv8_3080',
 'YOLOv8_3090',
 'YOLOv8_3080_Optimized']

In [54]:
runtimes = get_task_runtimes(tasks)
performance_per_dollar = calc_performance_per_dollar("tuned_bs.pkl", runtimes)
performance_per_dollar

{'g4dn.xlarge': (5.338821838299434, 9.09509682844878),
 'g4dn.2xlarge': (5.2651630491018295, 6.283010798450872),
 'g4dn.4xlarge': (4.67342293759187, 3.482431399099754),
 'g4dn.8xlarge': (4.516593351960182, 1.861744992563966),
 'genesis_3060Ti': (5.824768727166312, 8.96118265717894),
 'clearml_3060Ti': (5.824768727166312, 14.935304428631568),
 'genesis_3080': (5.755358040332794, 6.394842267036438),
 'clearml_3080': (5.755358040332794, 10.658070445060728),
 'genesis_3090': (5.447098915775617, 4.1900760890581665),
 'clearml_3090': (5.447098915775617, 6.983460148430277),
 'genesis_3080_Optimized': (5.252548644940059, 5.252548644940059),
 'clearml_3080_Optimized': (5.252548644940059, 8.754247741566765)}

In [55]:
plot_performance_per_dollar(performance_per_dollar)

# Higher batch size is slower??

In [28]:
import numpy as np

data = {
    16: [],
    32: [],
    64: []
}

order = {
    16: [],
    32: [],
    64: []
}

for i, machine_type in enumerate(queues):
    tasks = Task.get_tasks(project_name="YOLOv8", tags=["BATCH_SIZE_TEST"], task_name=f"^YOLOv8_{machine_type}$", task_filter={'status': ['completed']})
    for task in tasks:
        batch_size = int(task.get_parameter("General/batch"))
        order[batch_size].append(task.name.replace("YOLOv8_", ""))
        print(task.name, batch_size)
        runtimes = get_task_runtimes([task])
        if runtimes:
            epoch_time = np.mean(runtimes[machine_type])
            data[batch_size].append(epoch_time)
        else:
            data[batch_size].append(0)

YOLOv8_g4dn.xlarge 16
YOLOv8_g4dn.xlarge 32
YOLOv8_g4dn.2xlarge 16
YOLOv8_g4dn.2xlarge 32
YOLOv8_g4dn.4xlarge 16
YOLOv8_g4dn.4xlarge 32
YOLOv8_g4dn.8xlarge 16
YOLOv8_g4dn.8xlarge 32
YOLOv8_3060Ti 16
YOLOv8_3080 16
YOLOv8_3080 32
YOLOv8_3090 16
YOLOv8_3090 32
YOLOv8_3090 64
YOLOv8_3080_Optimized 16
YOLOv8_3080_Optimized 32
YOLOv8_3090_Optimized 16
YOLOv8_3090_Optimized 32
YOLOv8_3090_Optimized 64


In [29]:
fig = go.Figure(data=[
    go.Bar(name='Batch 16', x=order[16], y=data[16]),
    go.Bar(name='batch 32', x=order[32], y=data[32]),
    go.Bar(name='batch 64', x=order[64], y=data[64])
])
# Change the bar mode
fig.update_layout(title="Average epoch runtime", barmode='group')
fig.show()

# Batchsize Aware Training

In [7]:
tasks = []
for queue in queues:
    tasks.append(clone_and_enqueue(template_task, queue, epochs=10, batch_size=ideal_batch_size[queue], tags=['VISDRONE'], dataset="VisDrone.yaml"))
    tasks.append(clone_and_enqueue(template_task, queue, epochs=10, batch_size=ideal_batch_size[queue]//2, tags=['VISDRONE'], dataset="VisDrone.yaml"))