In [1]:
from utils import (
    get_dataset_size_from_model_size, calculate_total_steps, calculate_total_flops, calculate_flops_per_step,
    calculate_num_h100s_per_step, calculate_total_time_to_train_a_model,
    compute_minimum_latency_between_clusters, calculate_total_minimum_comm_latency_to_train_a_model
)
from utils import (
    convert_to_petaflops, convert_to_exaflops, convert_seconds_to_days,
    convert_to_xt_format, convert_to_million_format, convert_to_billion_format, convert_seconds_to_years
)
from constants import UTILIZED_BFLOAT16_FLOPS, H100_COST_PER_HOUR, H100_COST_PER_GPU
import pandas as pd
import numpy as np

#### Compute

Assumption: one fwd+bwd pass takes a second (if it take x more seconds, then you scale the numbers linearly), 20k per h100 (wholesale price)

In [2]:
# 100b to 100T
target_model_sizes = [100*10**9, 500*10**9, 1000*10**9, 10000*10**9, 100000*10**9]
global_batch_sizes = [x*10**6 for x in [2, 16, 40]]

In [3]:
time_per_step = 1 # the total time of a fwd, and bwd pass

In [4]:
dataframes_compute = []

for global_batch_size in global_batch_sizes:
    data_compute = {
        "Model Size (Params)": [],
        "Dataset Size (Tokens)": [],
        "Global Batch Size": [],
        "Total Steps": [],
        "Total FLOPs": [],
        "FLOPs per Step": [],
        "H100 GPUs Needed": [],
        "Total H100s cost": [],
        "Total Training Time without grad accum": [],
        "Total Training Time with 10 grad accum": [],
        "Total Training Time with 100 grad accum": [],
        "Total Training Time with 1000 grad accum": []
    }
    
    for model_size in target_model_sizes:
        dataset_size = get_dataset_size_from_model_size(model_size)
        total_steps = calculate_total_steps(model_size, global_batch_size)
        total_flops = calculate_total_flops(model_size)
        flops_per_step = calculate_flops_per_step(model_size, global_batch_size)
        h100s_per_step = calculate_num_h100s_per_step(model_size, global_batch_size, UTILIZED_BFLOAT16_FLOPS)
        total_time = calculate_total_time_to_train_a_model(model_size, global_batch_size, time_per_step)
        
        data_compute["Model Size (Params)"].append(convert_to_xt_format(model_size))
        data_compute["Dataset Size (Tokens)"].append(convert_to_xt_format(dataset_size))
        data_compute["Global Batch Size"].append(f'{global_batch_size/1e6}M')
        data_compute["Total Steps"].append("{:,}".format(total_steps))
        data_compute["Total FLOPs"].append(convert_to_exaflops(total_flops))
        data_compute["FLOPs per Step"].append(convert_to_petaflops(flops_per_step))
        data_compute["H100 GPUs Needed"].append(h100s_per_step)
        data_compute["Total H100s cost"].append(convert_to_billion_format(h100s_per_step * H100_COST_PER_GPU))
        data_compute["Total Training Time without grad accum"].append(convert_seconds_to_days(total_time))
        data_compute["Total Training Time with 10 grad accum"].append(f"{convert_seconds_to_years(total_time*10)} - {h100s_per_step/10} gpus")
        data_compute["Total Training Time with 100 grad accum"].append(f"{convert_seconds_to_years(total_time*100)} - {h100s_per_step/100} gpus")
        data_compute["Total Training Time with 1000 grad accum"].append(f"{convert_seconds_to_years(total_time*1000)} - {h100s_per_step/1000} gpus")
    
    df = pd.DataFrame(data_compute)
    # Add batch size information
    # df['Global Batch Size'] = f'{global_batch_size/1e6}M'
    dataframes_compute.append(df)

final_df_compute = pd.DataFrame()
for i, df in enumerate(dataframes_compute):
    final_df_compute = pd.concat([final_df_compute, df])

In [5]:
final_df_compute

Unnamed: 0,Model Size (Params),Dataset Size (Tokens),Global Batch Size,Total Steps,Total FLOPs,FLOPs per Step,H100 GPUs Needed,Total H100s cost,Total Training Time without grad accum,Total Training Time with 10 grad accum,Total Training Time with 100 grad accum,Total Training Time with 1000 grad accum
0,0.1T,2.0T,2.0M,1000000,"1,200,000.0 EFLOPs","1,200.0 PFLOPs",2206.0,0.07B,11.6 days,0.3 years - 220.6 gpus,3.2 years - 22.06 gpus,31.7 years - 2.206 gpus
1,0.5T,10.0T,2.0M,5000000,"30,000,000.0 EFLOPs","6,000.0 PFLOPs",11030.0,0.33B,57.9 days,1.6 years - 1103.0 gpus,15.8 years - 110.3 gpus,158.4 years - 11.03 gpus
2,1.0T,20.0T,2.0M,10000000,"120,000,000.0 EFLOPs","12,000.0 PFLOPs",22060.0,0.66B,115.7 days,3.2 years - 2206.0 gpus,31.7 years - 220.6 gpus,316.9 years - 22.06 gpus
3,10.0T,200.0T,2.0M,100000000,"12,000,000,000.0 EFLOPs","120,000.0 PFLOPs",220608.0,6.62B,1157.4 days,31.7 years - 22060.8 gpus,316.9 years - 2206.08 gpus,3168.8 years - 220.608 gpus
4,100.0T,2000.0T,2.0M,1000000000,"1,200,000,000,000.0 EFLOPs","1,200,000.0 PFLOPs",2206085.0,66.18B,11574.1 days,316.9 years - 220608.5 gpus,3168.8 years - 22060.85 gpus,31688.1 years - 2206.085 gpus
0,0.1T,2.0T,16.0M,125000,"1,200,000.0 EFLOPs","9,600.0 PFLOPs",17648.0,0.53B,1.4 days,0.0 years - 1764.8 gpus,0.4 years - 176.48 gpus,4.0 years - 17.648 gpus
1,0.5T,10.0T,16.0M,625000,"30,000,000.0 EFLOPs","48,000.0 PFLOPs",88243.0,2.65B,7.2 days,0.2 years - 8824.3 gpus,2.0 years - 882.43 gpus,19.8 years - 88.243 gpus
2,1.0T,20.0T,16.0M,1250000,"120,000,000.0 EFLOPs","96,000.0 PFLOPs",176486.0,5.29B,14.5 days,0.4 years - 17648.6 gpus,4.0 years - 1764.86 gpus,39.6 years - 176.486 gpus
3,10.0T,200.0T,16.0M,12500000,"12,000,000,000.0 EFLOPs","960,000.0 PFLOPs",1764868.0,52.95B,144.7 days,4.0 years - 176486.8 gpus,39.6 years - 17648.68 gpus,396.1 years - 1764.868 gpus
4,100.0T,2000.0T,16.0M,125000000,"1,200,000,000,000.0 EFLOPs","9,600,000.0 PFLOPs",17648680.0,529.46B,1446.8 days,39.6 years - 1764868.0 gpus,396.1 years - 176486.8 gpus,3961.0 years - 17648.68 gpus


##### Communication time of data parallelism

In [6]:
from constants import FP8_BYTES, BFLOAT16_BYTES
from utils import convert_bytes_to_terabytes
from utils import calculate_comm_time_given_comm_volume, convert_bytes_to_gigabytes
from constants import NVLINK_MAX_TOTAL_BANDWIDTH

Assume that fwd+bwd pass of a single replicas takes 1 seconds

In [7]:
# comm_bandwidths = [0.5*1024**3, 1*1024**3, 4*1024**3] # bytes/sec
# comm_bandwidths = [40*1024**3, NVLINK_MAX_TOTAL_BANDWIDTH] # bytes/sec
comm_bandwidths = [40*1024**3] # bytes/sec
cluster_sizes = [1024, 10240, 102400]

data_mem = {
    "Model Size (Params)": [],
    "Global batch size": [],
    # "Number of datacenters": [],
    "Total bfloat16 gradient storage": [],
    "Total fp8 gradient storage": [],
    "Bandwidth": [],
    "Total communication time in bfloat16 - comm/compute ratio": [],
    "Total communication time in fp8 - comm/compute ratio": [],
    "Total GPU idle cost for bfloat16 comm": [],
    "Total GPU idle cost for fp8 comm": [],
    "DiLoCo's total communication time in bfloat16 (500 inner steps) - comm/compute ratio": []
}
# for cluster_size in cluster_sizes:
for bandwidth in comm_bandwidths:
    for global_batch_size in global_batch_sizes:
        for model_size in target_model_sizes:
            total_steps = calculate_total_steps(model_size, global_batch_size)
            total_time = calculate_total_time_to_train_a_model(model_size, global_batch_size, time_per_step)
            
            # h100s_per_step = calculate_num_h100s_per_step(model_size, global_batch_size, UTILIZED_BFLOAT16_FLOPS)
            # num_clusters = h100s_per_step // cluster_size

            bfloat16_grad_comm_volume = model_size * BFLOAT16_BYTES
            fp8_grad_comm_volume = model_size * FP8_BYTES
            
            bfloat16_total_comm_time = calculate_comm_time_given_comm_volume(bfloat16_grad_comm_volume, bandwidth) * total_steps
            fp8_total_comm_time = calculate_comm_time_given_comm_volume(fp8_grad_comm_volume, bandwidth) * total_steps
            bfloat16_diloco_total_comm_time = calculate_comm_time_given_comm_volume(fp8_grad_comm_volume, bandwidth) * (total_steps / 500)

            bfloat16_comm_compute_ratio = (bfloat16_total_comm_time / (total_time + bfloat16_total_comm_time)) * 100
            fp8_comm_compute_ratio = (fp8_total_comm_time / (total_time + fp8_total_comm_time)) * 100
            bfloat16_diloco_comm_compute_ratio = (bfloat16_diloco_total_comm_time / (total_time + bfloat16_diloco_total_comm_time)) * 100
            
            bfloat16_total_gpu_idle_cost_comm = ((bfloat16_total_comm_time * h100s_per_step) / (60*60)) / H100_COST_PER_HOUR
            fp8_total_gpu_idle_cost_comm = ((fp8_total_comm_time * h100s_per_step) / (60*60)) / H100_COST_PER_HOUR
            
            data_mem["Model Size (Params)"].append(convert_to_xt_format(model_size))
            data_mem["Global batch size"].append(f'{global_batch_size/1e6}M')
            # data_mem["Number of datacenters"].append(num_clusters)
            data_mem["Total bfloat16 gradient storage"].append(convert_bytes_to_terabytes(model_size * BFLOAT16_BYTES))
            data_mem["Total fp8 gradient storage"].append(convert_bytes_to_terabytes(model_size * FP8_BYTES))
            data_mem["Bandwidth"].append(f"{convert_bytes_to_gigabytes(bandwidth)}/s")
            data_mem["Total communication time in bfloat16 - comm/compute ratio"].append(f"{convert_seconds_to_days(bfloat16_total_comm_time)} / {convert_seconds_to_years(bfloat16_total_comm_time)} - {bfloat16_comm_compute_ratio:.2f}%")
            data_mem["Total communication time in fp8 - comm/compute ratio"].append(f"{convert_seconds_to_days(bfloat16_total_comm_time)} / {convert_seconds_to_years(fp8_total_comm_time)} - {fp8_comm_compute_ratio:.2f}%")
            data_mem["Total GPU idle cost for bfloat16 comm"].append(convert_to_billion_format(bfloat16_total_gpu_idle_cost_comm))
            data_mem["Total GPU idle cost for fp8 comm"].append(convert_to_billion_format(fp8_total_gpu_idle_cost_comm))
            data_mem["DiLoCo's total communication time in bfloat16 (500 inner steps) - comm/compute ratio"].append(f"{convert_seconds_to_days(bfloat16_diloco_total_comm_time)}  - {bfloat16_diloco_comm_compute_ratio:.2f}%")
    
    df_mem = pd.DataFrame(data_mem)

Add cluster size

In [8]:
df_mem

Unnamed: 0,Model Size (Params),Global batch size,Total bfloat16 gradient storage,Total fp8 gradient storage,Bandwidth,Total communication time in bfloat16 - comm/compute ratio,Total communication time in fp8 - comm/compute ratio,Total GPU idle cost for bfloat16 comm,Total GPU idle cost for fp8 comm,DiLoCo's total communication time in bfloat16 (500 inner steps) - comm/compute ratio
0,0.1T,2.0M,0.200 TB,0.100 TB,42.950 GB/s,53.9 days / 0.1 years - 82.32%,53.9 days / 0.1 years - 69.95%,28.54B,14.27B,0.1 days - 0.46%
1,0.5T,2.0M,1.000 TB,0.500 TB,42.950 GB/s,1347.4 days / 3.7 years - 95.88%,1347.4 days / 1.8 years - 92.09%,713.39B,356.70B,1.3 days - 2.28%
2,1.0T,2.0M,2.000 TB,1.000 TB,42.950 GB/s,5389.6 days / 14.8 years - 97.90%,5389.6 days / 7.4 years - 95.88%,2853.58B,1426.79B,5.4 days - 4.45%
3,10.0T,2.0M,20.000 TB,10.000 TB,42.950 GB/s,538959.8 days / 1475.6 years - 99.79%,538959.8 days / 737.8 years - 99.57%,285357.90B,142678.95B,539.0 days - 31.77%
4,100.0T,2.0M,200.000 TB,100.000 TB,42.950 GB/s,53895982.3 days / 147559.2 years - 99.98%,53895982.3 days / 73779.6 years - 99.96%,28535789.65B,14267894.83B,53896.0 days - 82.32%
5,0.1T,16.0M,0.200 TB,0.100 TB,42.950 GB/s,6.7 days / 0.0 years - 82.32%,6.7 days / 0.0 years - 69.95%,3.57B,1.78B,0.0 days - 0.46%
6,0.5T,16.0M,1.000 TB,0.500 TB,42.950 GB/s,168.4 days / 0.5 years - 95.88%,168.4 days / 0.2 years - 92.09%,89.17B,44.59B,0.2 days - 2.28%
7,1.0T,16.0M,2.000 TB,1.000 TB,42.950 GB/s,673.7 days / 1.8 years - 97.90%,673.7 days / 0.9 years - 95.88%,356.70B,178.35B,0.7 days - 4.45%
8,10.0T,16.0M,20.000 TB,10.000 TB,42.950 GB/s,67370.0 days / 184.4 years - 99.79%,67370.0 days / 92.2 years - 99.57%,35669.74B,17834.87B,67.4 days - 31.77%
9,100.0T,16.0M,200.000 TB,100.000 TB,42.950 GB/s,6736997.8 days / 18444.9 years - 99.98%,6736997.8 days / 9222.4 years - 99.96%,3566973.71B,1783486.85B,6737.0 days - 82.32%


#### Communication latency (theoretical minimum)

Assumptions on communication
- No limit on banwidth
- Achieve speed of light
- Clostest surface distance between two points on the earth surface (assume you don't dig a crazy hole to go a straight line)

In [9]:
minimum_latency_between_jz_and_jc = compute_minimum_latency_between_clusters("JEAN_ZAY", "JOLIOT_CURIE")
minimum_latency_between_jz_and_ec = compute_minimum_latency_between_clusters("JEAN_ZAY", "EL_CAPITAN")

In [10]:
dataframes_comm = []

for global_batch_size in global_batch_sizes:
    data_comm = {
        "Model Size (Params)": [],
        "Dataset Size (Tokens)": [],
        "Global Batch Size": [],
        
        "Total minimum communication latency between JZ and JC": [],
        "Total GPU idle time for minimum comm between JZ and JC": [],
        "GPU idle cost during JZ-JC minimum communication latency": [],
        
        "Total minimum communication latency between JZ and EC": [],
        "Total GPU idle time for minimum comm between JZ and EC": [],
        "GPU idle cost during JZ-EC minimum communication latency": [],
    }
    
    for model_size in target_model_sizes:
        dataset_size = get_dataset_size_from_model_size(model_size)
        # Append to dictionary
        data_comm["Model Size (Params)"].append(convert_to_xt_format(model_size))
        data_comm["Dataset Size (Tokens)"].append(convert_to_xt_format(dataset_size))
        data_comm["Global Batch Size"].append(f'{global_batch_size/1e6}M')
        
        data_comm["Total minimum communication latency between JZ and JC"].append(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_jc))
        data_comm["Total GPU idle time for minimum comm between JZ and JC"].append(convert_seconds_to_days(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_jc) * h100s_per_step))
        data_comm["GPU idle cost during JZ-JC minimum communication latency"].append(convert_to_million_format((calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_jc) / (60 * 60)) * h100s_per_step * H100_COST_PER_HOUR))
        
        data_comm["Total minimum communication latency between JZ and EC"].append(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_ec))
        data_comm["Total GPU idle time for minimum comm between JZ and EC"].append(convert_seconds_to_years(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_ec) * h100s_per_step))
        data_comm["GPU idle cost during JZ-EC minimum communication latency"].append(convert_to_million_format((calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_ec) / (60 * 60)) * h100s_per_step * H100_COST_PER_HOUR))
    
    # Convert to DataFrame
    df_comm = pd.DataFrame(data_comm)
    # df_comm['Global Batch Size'] = f'{global_batch_size/1e6}M'
    dataframes_comm.append(df_comm)

final_df_comm = pd.DataFrame()
for i, df in enumerate(dataframes_comm):
    final_df_comm = pd.concat([final_df_comm, df])

In [11]:
final_df_comm

Unnamed: 0,Model Size (Params),Dataset Size (Tokens),Global Batch Size,Total minimum communication latency between JZ and JC,Total GPU idle time for minimum comm between JZ and JC,GPU idle cost during JZ-JC minimum communication latency,Total minimum communication latency between JZ and EC,Total GPU idle time for minimum comm between JZ and EC,GPU idle cost during JZ-EC minimum communication latency
0,0.1T,2.0T,2.0M,1.27827,652.8 days,0.0m,29.790336,41.7 years,0.7m
1,0.5T,10.0T,2.0M,6.391352,3263.9 days,0.2m,148.951681,208.3 years,3.7m
2,1.0T,20.0T,2.0M,12.782705,6527.7 days,0.3m,297.903362,416.5 years,7.3m
3,10.0T,200.0T,2.0M,127.827046,65277.2 days,3.1m,2979.033618,4165.1 years,73.0m
4,100.0T,2000.0T,2.0M,1278.270459,652771.6 days,31.3m,29790.336181,41650.8 years,730.2m
0,0.1T,2.0T,16.0M,0.159784,81.6 days,0.0m,3.723792,5.2 years,0.1m
1,0.5T,10.0T,16.0M,0.798919,408.0 days,0.0m,18.61896,26.0 years,0.5m
2,1.0T,20.0T,16.0M,1.597838,816.0 days,0.0m,37.23792,52.1 years,0.9m
3,10.0T,200.0T,16.0M,15.978381,8159.6 days,0.4m,372.379202,520.6 years,9.1m
4,100.0T,2000.0T,16.0M,159.783807,81596.5 days,3.9m,3723.792023,5206.4 years,91.3m


#### Electricity

In [12]:
from constants import TOTAL_H100_WATT
from utils import convert_watts_to_megawatts, convert_watts_to_terawatts, calculate_electricity_consumption_of_an_h100

In [13]:
dataframes_elec = []

for global_batch_size in global_batch_sizes:
    data_elec = {
        "Model Size (Params)": [],
        "Dataset Size (Tokens)": [],
        "Global batch size": [],
        "Number of GPUs": [],
        "Total electricity per step (without grad accum)": [],
        "Total electricity for the entire training (without grad accum)": []
    }
    
    for model_size in target_model_sizes:
        dataset_size = get_dataset_size_from_model_size(model_size)
        h100s_per_step = calculate_num_h100s_per_step(model_size, global_batch_size, UTILIZED_BFLOAT16_FLOPS)
        total_time = calculate_total_time_to_train_a_model(model_size, global_batch_size, time_per_step)
        total_electricity_consumption = calculate_electricity_consumption_of_an_h100(TOTAL_H100_WATT, total_time) * h100s_per_step
        
        data_elec["Model Size (Params)"].append(convert_to_xt_format(model_size))
        data_elec["Dataset Size (Tokens)"].append(convert_to_xt_format(dataset_size))
        data_elec["Global batch size"].append(f'{global_batch_size/1e6}M')
        data_elec["Number of GPUs"].append(h100s_per_step)
        data_elec["Total electricity per step (without grad accum)"].append(convert_watts_to_megawatts(h100s_per_step * TOTAL_H100_WATT))
        data_elec["Total electricity for the entire training (without grad accum)"].append(f"{convert_watts_to_terawatts(total_electricity_consumption)}")
    
    df = pd.DataFrame(data_elec)
    # df['Global Batch Size'] = f'{global_batch_size/1e6}M'
    dataframes_elec.append(df)

final_df_elec = pd.DataFrame()
for i, df in enumerate(dataframes_elec):
    final_df_elec = pd.concat([final_df_elec, df])

In [14]:
# my calculation closes to 100k gpu cluster's electricity: https://semianalysis.com/2024/06/17/100000-h100-clusters-power-network/#power-challenges

In [15]:
final_df_elec

Unnamed: 0,Model Size (Params),Dataset Size (Tokens),Global batch size,Number of GPUs,Total electricity per step (without grad accum),Total electricity for the entire training (without grad accum)
0,0.1T,2.0T,2.0M,2206.0,2.813 MW,2.812650000000 TW
1,0.5T,10.0T,2.0M,11030.0,14.063 MW,70.316250000000 TW
2,1.0T,20.0T,2.0M,22060.0,28.127 MW,281.265000000000 TW
3,10.0T,200.0T,2.0M,220608.0,281.275 MW,28127.520000000000 TW
4,100.0T,2000.0T,2.0M,2206085.0,2812.758 MW,2812758.375000000000 TW
0,0.1T,2.0T,16.0M,17648.0,22.501 MW,2.812650000000 TW
1,0.5T,10.0T,16.0M,88243.0,112.510 MW,70.318640625000 TW
2,1.0T,20.0T,16.0M,176486.0,225.020 MW,281.274562500000 TW
3,10.0T,200.0T,16.0M,1764868.0,2250.207 MW,28127.583750000002 TW
4,100.0T,2000.0T,16.0M,17648680.0,22502.067 MW,2812758.375000000000 TW
