In [1]:
from utils import (
    get_dataset_size_from_model_size, calculate_total_steps, calculate_total_flops, calculate_flops_per_step,
    calculate_num_h100s_per_step, calculate_total_time_to_train_a_model,
    compute_minimum_latency_between_clusters, calculate_total_minimum_comm_latency_to_train_a_model
)
from utils import (
    convert_to_petaflops, convert_to_exaflops, convert_seconds_to_days,
    convert_to_xt_format, convert_to_million_format, convert_to_billion_format, convert_seconds_to_years
)
from constants import UTILIZED_BFLOAT16_FLOPS, H100_COST_PER_HOUR, H100_COST_PER_GPU
import pandas as pd
import numpy as np

#### Compute

Assumption: one fwd+bwd pass takes a second (if it take x more seconds, then you scale the numbers linearly), 20k per h100 (wholesale price)

In [2]:
# 100b to 100T
target_model_sizes = [100*10**9, 500*10**9, 1000*10**9, 10000*10**9, 100000*10**9]
global_batch_sizes = [x*10**6 for x in [2, 16, 40]]

In [3]:
time_per_step = 1 # the total time of a fwd, and bwd pass

In [4]:
dataframes_compute = []

for global_batch_size in global_batch_sizes:
    data_compute = {
        "Model Size (Params)": [],
        "Dataset Size (Tokens)": [],
        "Total Steps": [],
        "Total FLOPs": [],
        "FLOPs per Step": [],
        "H100 GPUs Needed": [],
        "Total H100s cost": [],
        "Total Training Time without grad accum": [],
        "Total Training Time with 10 grad accum": [],
        "Total Training Time with 100 grad accum": [],
        "Total Training Time with 1000 grad accum": []
    }
    
    for model_size in target_model_sizes:
        dataset_size = get_dataset_size_from_model_size(model_size)
        total_steps = calculate_total_steps(model_size, global_batch_size)
        total_flops = calculate_total_flops(model_size)
        flops_per_step = calculate_flops_per_step(model_size, global_batch_size)
        h100s_per_step = calculate_num_h100s_per_step(model_size, global_batch_size, UTILIZED_BFLOAT16_FLOPS)
        total_time = calculate_total_time_to_train_a_model(model_size, global_batch_size, time_per_step)
        
        data_compute["Model Size (Params)"].append(convert_to_xt_format(model_size))
        data_compute["Dataset Size (Tokens)"].append(convert_to_xt_format(dataset_size))
        data_compute["Total Steps"].append("{:,}".format(total_steps))
        data_compute["Total FLOPs"].append(convert_to_exaflops(total_flops))
        data_compute["FLOPs per Step"].append(convert_to_petaflops(flops_per_step))
        data_compute["H100 GPUs Needed"].append(h100s_per_step)
        data_compute["Total H100s cost"].append(convert_to_billion_format(h100s_per_step * H100_COST_PER_GPU))
        data_compute["Total Training Time without grad accum"].append(convert_seconds_to_days(total_time))
        data_compute["Total Training Time with 10 grad accum"].append(f"{convert_seconds_to_years(total_time*10)} - {h100s_per_step/10} gpus")
        data_compute["Total Training Time with 100 grad accum"].append(f"{convert_seconds_to_years(total_time*100)} - {h100s_per_step/100} gpus")
        data_compute["Total Training Time with 1000 grad accum"].append(f"{convert_seconds_to_years(total_time*1000)} - {h100s_per_step/1000} gpus")
    
    df = pd.DataFrame(data_compute)
    # Add batch size information
    df['Global Batch Size'] = f'{global_batch_size/1e6}M'
    dataframes_compute.append(df)

final_df_compute = pd.DataFrame()
for i, df in enumerate(dataframes_compute):
    final_df_compute = pd.concat([final_df_compute, df])

In [5]:
final_df_compute

Unnamed: 0,Model Size (Params),Dataset Size (Tokens),Total Steps,Total FLOPs,FLOPs per Step,H100 GPUs Needed,Total H100s cost,Total Training Time without grad accum,Total Training Time with 10 grad accum,Total Training Time with 100 grad accum,Total Training Time with 1000 grad accum,Global Batch Size
0,0.1T,2.0T,1000000,"1,200,000.0 EFLOPs","1,200.0 PFLOPs",2206.0,0.1B,11.574074074074074 days,0.0 years - 220.6 gpus,3.0 years - 22.06 gpus,31.0 years - 2.206 gpus,2.0M
1,0.5T,10.0T,5000000,"30,000,000.0 EFLOPs","6,000.0 PFLOPs",11030.0,0.3B,57.870370370370374 days,1.0 years - 1103.0 gpus,15.0 years - 110.3 gpus,158.0 years - 11.03 gpus,2.0M
2,1.0T,20.0T,10000000,"120,000,000.0 EFLOPs","12,000.0 PFLOPs",22060.0,0.7B,115.74074074074075 days,3.0 years - 2206.0 gpus,31.0 years - 220.6 gpus,316.0 years - 22.06 gpus,2.0M
3,10.0T,200.0T,100000000,"12,000,000,000.0 EFLOPs","120,000.0 PFLOPs",220608.0,6.6B,"1,157.4074074074074 days",31.0 years - 22060.8 gpus,316.0 years - 2206.08 gpus,"3,168.0 years - 220.608 gpus",2.0M
4,100.0T,2000.0T,1000000000,"1,200,000,000,000.0 EFLOPs","1,200,000.0 PFLOPs",2206085.0,66.2B,"11,574.074074074075 days",316.0 years - 220608.5 gpus,"3,168.0 years - 22060.85 gpus","31,688.0 years - 2206.085 gpus",2.0M
0,0.1T,2.0T,125000,"1,200,000.0 EFLOPs","9,600.0 PFLOPs",17648.0,0.5B,1.4467592592592593 days,0.0 years - 1764.8 gpus,0.0 years - 176.48 gpus,3.0 years - 17.648 gpus,16.0M
1,0.5T,10.0T,625000,"30,000,000.0 EFLOPs","48,000.0 PFLOPs",88243.0,2.6B,7.233796296296297 days,0.0 years - 8824.3 gpus,1.0 years - 882.43 gpus,19.0 years - 88.243 gpus,16.0M
2,1.0T,20.0T,1250000,"120,000,000.0 EFLOPs","96,000.0 PFLOPs",176486.0,5.3B,14.467592592592593 days,0.0 years - 17648.6 gpus,3.0 years - 1764.86 gpus,39.0 years - 176.486 gpus,16.0M
3,10.0T,200.0T,12500000,"12,000,000,000.0 EFLOPs","960,000.0 PFLOPs",1764868.0,52.9B,144.67592592592592 days,3.0 years - 176486.8 gpus,39.0 years - 17648.68 gpus,396.0 years - 1764.868 gpus,16.0M
4,100.0T,2000.0T,125000000,"1,200,000,000,000.0 EFLOPs","9,600,000.0 PFLOPs",17648680.0,529.5B,"1,446.7592592592594 days",39.0 years - 1764868.0 gpus,396.0 years - 176486.8 gpus,"3,961.0 years - 17648.68 gpus",16.0M


##### Memory

In [6]:
from constants import FP8_BYTES, BFLOAT16_BYTES
from utils import convert_bytes_to_terabytes

In [9]:
data_mem = {
    "Model Size (Params)": [],
    "Total bfloat16 gradient storage": [],
    "Total fp8 gradient storage": [],
}

for model_size in target_model_sizes:        
    data_mem["Model Size (Params)"].append(convert_to_xt_format(model_size))
    data_mem["Total bfloat16 gradient storage"].append(convert_bytes_to_terabytes(model_size * BFLOAT16_BYTES))
    data_mem["Total fp8 gradient storage"].append(convert_bytes_to_terabytes(model_size * FP8_BYTES))

df_mem = pd.DataFrame(data_mem)

In [10]:
df_mem

Unnamed: 0,Model Size (Params),Total bfloat16 gradient storage,Total fp8 gradient storage
0,0.1T,0.200 TB,0.100 TB
1,0.5T,1.000 TB,0.500 TB
2,1.0T,2.000 TB,1.000 TB
3,10.0T,20.000 TB,10.000 TB
4,100.0T,200.000 TB,100.000 TB


#### Communication

Assumptions on communication
- No limit on banwidth
- Achieve speed of light
- Clostest surface distance between two points on the earth surface (assume you don't dig a crazy hole to go a straight line)

In [6]:
minimum_latency_between_jz_and_jc = compute_minimum_latency_between_clusters("JEAN_ZAY", "JOLIOT_CURIE")
minimum_latency_between_jz_and_ec = compute_minimum_latency_between_clusters("JEAN_ZAY", "EL_CAPITAN")

In [7]:
dataframes_comm = []

for global_batch_size in global_batch_sizes:
    data_comm = {
        "Model Size (Params)": [],
        "Dataset Size (Tokens)": [],
        
        "Total minimum communication latency between JZ and JC": [],
        "Total GPU idle time for minimum comm between JZ and JC": [],
        "GPU idle cost during JZ-JC minimum communication latency": [],
        
        "Total minimum communication latency between JZ and EC": [],
        "Total GPU idle time for minimum comm between JZ and EC": [],
        "GPU idle cost during JZ-EC minimum communication latency": [],
    }
    
    for model_size in target_model_sizes:
        dataset_size = get_dataset_size_from_model_size(model_size)
        # Append to dictionary
        data_comm["Model Size (Params)"].append(convert_to_xt_format(model_size))
        data_comm["Dataset Size (Tokens)"].append(convert_to_xt_format(dataset_size))
        
        data_comm["Total minimum communication latency between JZ and JC"].append(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_jc))
        data_comm["Total GPU idle time for minimum comm between JZ and JC"].append(convert_seconds_to_days(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_jc) * h100s_per_step))
        data_comm["GPU idle cost during JZ-JC minimum communication latency"].append(convert_to_million_format((calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_jc) / (60 * 60)) * h100s_per_step * H100_COST_PER_HOUR))
        
        data_comm["Total minimum communication latency between JZ and EC"].append(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_ec))
        data_comm["Total GPU idle time for minimum comm between JZ and EC"].append(convert_seconds_to_years(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_ec) * h100s_per_step))
        data_comm["GPU idle cost during JZ-EC minimum communication latency"].append(convert_to_million_format((calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_ec) / (60 * 60)) * h100s_per_step * H100_COST_PER_HOUR))
    
    # Convert to DataFrame
    df_comm = pd.DataFrame(data_comm)
    df_comm['Global Batch Size'] = f'{global_batch_size/1e6}M'
    dataframes_comm.append(df_comm)

final_df_comm = pd.DataFrame()
for i, df in enumerate(dataframes_comm):
    final_df_comm = pd.concat([final_df_comm, df])

In [8]:
final_df_comm

Unnamed: 0,Model Size (Params),Dataset Size (Tokens),Total minimum communication latency between JZ and JC,Total GPU idle time for minimum comm between JZ and JC,GPU idle cost during JZ-JC minimum communication latency,Total minimum communication latency between JZ and EC,Total GPU idle time for minimum comm between JZ and EC,GPU idle cost during JZ-EC minimum communication latency,Global Batch Size
0,0.1T,2.0T,1.27827,652.7716235955164 days,0.0m,29.790336,41.0 years,0.7m,2.0M
1,0.5T,10.0T,6.391352,"3,263.858117977583 days",0.2m,148.951681,208.0 years,3.7m,2.0M
2,1.0T,20.0T,12.782705,"6,527.716235955166 days",0.3m,297.903362,416.0 years,7.3m,2.0M
3,10.0T,200.0T,127.827046,"65,277.16235955166 days",3.1m,2979.033618,"4,165.0 years",73.0m,2.0M
4,100.0T,2000.0T,1278.270459,"652,771.6235955165 days",31.3m,29790.336181,"41,650.0 years",730.2m,2.0M
0,0.1T,2.0T,0.159784,81.59645294943955 days,0.0m,3.723792,5.0 years,0.1m,16.0M
1,0.5T,10.0T,0.798919,407.98226474719786 days,0.0m,18.61896,26.0 years,0.5m,16.0M
2,1.0T,20.0T,1.597838,815.9645294943957 days,0.0m,37.23792,52.0 years,0.9m,16.0M
3,10.0T,200.0T,15.978381,"8,159.645294943957 days",0.4m,372.379202,520.0 years,9.1m,16.0M
4,100.0T,2000.0T,159.783807,"81,596.45294943957 days",3.9m,3723.792023,"5,206.0 years",91.3m,16.0M


#### Electricity

In [9]:
from constants import TOTAL_H100_WATT
from utils import convert_watts_to_megawatts, convert_watts_to_terawatts, calculate_electricity_consumption_of_an_h100

In [10]:
dataframes_elec = []

for global_batch_size in global_batch_sizes:
    data_elec = {
        "Model Size (Params)": [],
        "Dataset Size (Tokens)": [],
        "Number of GPUs": [],
        "Total electricity per step (without grad accum)": [],
        "Total electricity for the entire training (without grad accum)": []
    }
    
    for model_size in target_model_sizes:
        dataset_size = get_dataset_size_from_model_size(model_size)
        h100s_per_step = calculate_num_h100s_per_step(model_size, global_batch_size, UTILIZED_BFLOAT16_FLOPS)
        total_time = calculate_total_time_to_train_a_model(model_size, global_batch_size, time_per_step)
        total_electricity_consumption = calculate_electricity_consumption_of_an_h100(TOTAL_H100_WATT, total_time) * h100s_per_step
        
        data_elec["Model Size (Params)"].append(convert_to_xt_format(model_size))
        data_elec["Dataset Size (Tokens)"].append(convert_to_xt_format(dataset_size))
        data_elec["Number of GPUs"].append(h100s_per_step)
        data_elec["Total electricity per step (without grad accum)"].append(convert_watts_to_megawatts(h100s_per_step * TOTAL_H100_WATT))
        data_elec["Total electricity for the entire training (without grad accum)"].append(f"{convert_watts_to_terawatts(total_electricity_consumption)}")
    
    df = pd.DataFrame(data_elec)
    df['Global Batch Size'] = f'{global_batch_size/1e6}M'
    dataframes_elec.append(df)

final_df_elec = pd.DataFrame()
for i, df in enumerate(dataframes_elec):
    final_df_elec = pd.concat([final_df_elec, df])

In [11]:
# my calculation closes to 100k gpu cluster's electricity: https://semianalysis.com/2024/06/17/100000-h100-clusters-power-network/#power-challenges

In [12]:
final_df_elec

Unnamed: 0,Model Size (Params),Dataset Size (Tokens),Number of GPUs,Total electricity per step (without grad accum),Total electricity for the entire training (without grad accum),Global Batch Size
0,0.1T,2.0T,2206.0,2.813 MW,2.812650000000 TW,2.0M
1,0.5T,10.0T,11030.0,14.063 MW,70.316250000000 TW,2.0M
2,1.0T,20.0T,22060.0,28.127 MW,281.265000000000 TW,2.0M
3,10.0T,200.0T,220608.0,281.275 MW,28127.520000000000 TW,2.0M
4,100.0T,2000.0T,2206085.0,2812.758 MW,2812758.375000000000 TW,2.0M
0,0.1T,2.0T,17648.0,22.501 MW,2.812650000000 TW,16.0M
1,0.5T,10.0T,88243.0,112.510 MW,70.318640625000 TW,16.0M
2,1.0T,20.0T,176486.0,225.020 MW,281.274562500000 TW,16.0M
3,10.0T,200.0T,1764868.0,2250.207 MW,28127.583750000002 TW,16.0M
4,100.0T,2000.0T,17648680.0,22502.067 MW,2812758.375000000000 TW,16.0M
