In [1]:
from utils import (
    get_dataset_size_from_model_size, calculate_total_steps, calculate_total_flops, calculate_flops_per_step,
    calculate_num_h100s_per_step, calculate_total_time_to_train_a_model,
    compute_minimum_latency_between_clusters, calculate_total_minimum_comm_latency_to_train_a_model
)
from utils import (
    convert_to_petaflops, convert_to_exaflops, convert_seconds_to_days,
    convert_to_xt_format, convert_to_million_format, convert_seconds_to_years
)
from constants import UTILIZED_BFLOAT16_FLOPS, H100_COST_PER_HOUR
import pandas as pd
import numpy as np

#### Compute

In [2]:
# 100b to 100T
target_model_sizes = [100*10**9, 500*10**9, 1000*10**9, 10000*10**9, 100000*10**9]
global_batch_sizes = [x*10**6 for x in [2, 16, 40]]

In [8]:
time_per_step = 1 # the total time of a fwd, and bwd pass

In [5]:
dataframes_compute = []

for global_batch_size in global_batch_sizes:
    data_compute = {
        "Model Size (Params)": [],
        "Dataset Size (Tokens)": [],
        "Total Steps": [],
        "Total FLOPs": [],
        "FLOPs per Step": [],
        "H100 GPUs Needed": [],
        "Total Training Time without grad accum": [],
        "Total Training Time with 10 grad accum": [],
        "Total Training Time with 100 grad accum": [],
        "Total Training Time with 1000 grad accum": []
    }
    
    for model_size in target_model_sizes:
        dataset_size = get_dataset_size_from_model_size(model_size)
        total_steps = calculate_total_steps(model_size, global_batch_size)
        total_flops = calculate_total_flops(model_size)
        flops_per_step = calculate_flops_per_step(model_size, global_batch_size)
        h100s_per_step = calculate_num_h100s_per_step(model_size, global_batch_size, UTILIZED_BFLOAT16_FLOPS)
        total_time = calculate_total_time_to_train_a_model(model_size, global_batch_size, time_per_step)
        
        data_compute["Model Size (Params)"].append(convert_to_xt_format(model_size))
        data_compute["Dataset Size (Tokens)"].append(convert_to_xt_format(dataset_size))
        data_compute["Total Steps"].append(total_steps)
        data_compute["Total FLOPs"].append(convert_to_exaflops(total_flops))
        data_compute["FLOPs per Step"].append(convert_to_petaflops(flops_per_step))
        data_compute["H100 GPUs Needed"].append(h100s_per_step)
        data_compute["Total Training Time without grad accum"].append(convert_seconds_to_days(total_time))
        data_compute["Total Training Time with 10 grad accum"].append(f"{convert_seconds_to_years(total_time*10)} - {h100s_per_step/10} gpus")
        data_compute["Total Training Time with 100 grad accum"].append(f"{convert_seconds_to_years(total_time*100)} - {h100s_per_step/100} gpus")
        data_compute["Total Training Time with 1000 grad accum"].append(f"{convert_seconds_to_years(total_time*1000)} - {h100s_per_step/1000} gpus")
    
    df = pd.DataFrame(data_compute)
    # Add batch size information
    df['Global Batch Size'] = f'{global_batch_size/1e6}M'
    dataframes_compute.append(df)

final_df_compute = pd.DataFrame()
for i, df in enumerate(dataframes_compute):
    final_df_compute = pd.concat([final_df_compute, df])

In [7]:
final_df_compute

Unnamed: 0,Model Size (Params),Dataset Size (Tokens),Total Steps,Total FLOPs,FLOPs per Step,H100 GPUs Needed,Total Training Time without grad accum,Total Training Time with 10 grad accum,Total Training Time with 100 grad accum,Total Training Time with 1000 grad accum,Global Batch Size
0,0.1T,2.0T,1000000,1200000.000 EFLOPs,1200.000 PFLOPs,2206.0,11.574 days,0.317 years - 220.6 gpus,3.169 years - 22.06 gpus,31.688 years - 2.206 gpus,2.0M
1,0.5T,10.0T,5000000,30000000.000 EFLOPs,6000.000 PFLOPs,11030.0,57.870 days,1.584 years - 1103.0 gpus,15.844 years - 110.3 gpus,158.440 years - 11.03 gpus,2.0M
2,1.0T,20.0T,10000000,120000000.000 EFLOPs,12000.000 PFLOPs,22060.0,115.741 days,3.169 years - 2206.0 gpus,31.688 years - 220.6 gpus,316.881 years - 22.06 gpus,2.0M
3,10.0T,200.0T,100000000,12000000000.000 EFLOPs,120000.000 PFLOPs,220608.0,1157.407 days,31.688 years - 22060.8 gpus,316.881 years - 2206.08 gpus,3168.809 years - 220.608 gpus,2.0M
4,100.0T,2000.0T,1000000000,1200000000000.000 EFLOPs,1200000.000 PFLOPs,2206085.0,11574.074 days,316.881 years - 220608.5 gpus,3168.809 years - 22060.85 gpus,31688.088 years - 2206.085 gpus,2.0M
0,0.1T,2.0T,125000,1200000.000 EFLOPs,9600.000 PFLOPs,17648.0,1.447 days,0.040 years - 1764.8 gpus,0.396 years - 176.48 gpus,3.961 years - 17.648 gpus,16.0M
1,0.5T,10.0T,625000,30000000.000 EFLOPs,48000.000 PFLOPs,88243.0,7.234 days,0.198 years - 8824.3 gpus,1.981 years - 882.43 gpus,19.805 years - 88.243 gpus,16.0M
2,1.0T,20.0T,1250000,120000000.000 EFLOPs,96000.000 PFLOPs,176486.0,14.468 days,0.396 years - 17648.6 gpus,3.961 years - 1764.86 gpus,39.610 years - 176.486 gpus,16.0M
3,10.0T,200.0T,12500000,12000000000.000 EFLOPs,960000.000 PFLOPs,1764868.0,144.676 days,3.961 years - 176486.8 gpus,39.610 years - 17648.68 gpus,396.101 years - 1764.868 gpus,16.0M
4,100.0T,2000.0T,125000000,1200000000000.000 EFLOPs,9600000.000 PFLOPs,17648680.0,1446.759 days,39.610 years - 1764868.0 gpus,396.101 years - 176486.8 gpus,3961.011 years - 17648.68 gpus,16.0M


#### Communication

Assumptions on communication
- No limit on banwidth
- Achieve speed of light
- Clostest surface distance between two points on the earth surface (assume you don't dig a crazy hole to go a straight line)

In [6]:
minimum_latency_between_jz_and_jc = compute_minimum_latency_between_clusters("JEAN_ZAY", "JOLIOT_CURIE")
minimum_latency_between_jz_and_ec = compute_minimum_latency_between_clusters("JEAN_ZAY", "EL_CAPITAN")

In [9]:
dataframes_comm = []

for global_batch_size in global_batch_sizes:
    data_comm = {
        "Model Size (Params)": [],
        "Dataset Size (Tokens)": [],
        
        "Total minimum communication latency between JZ and JC": [],
        "Total GPU idle time for minimum comm between JZ and JC": [],
        "GPU idle cost during JZ-JC minimum communication latency": [],
        
        "Total minimum communication latency between JZ and EC": [],
        "Total GPU idle time for minimum comm between JZ and EC": [],
        "GPU idle cost during JZ-EC minimum communication latency": [],
    }
    
    for model_size in target_model_sizes:
        dataset_size = get_dataset_size_from_model_size(model_size)
        # Append to dictionary
        data_comm["Model Size (Params)"].append(convert_to_xt_format(model_size))
        data_comm["Dataset Size (Tokens)"].append(convert_to_xt_format(dataset_size))
        
        data_comm["Total minimum communication latency between JZ and JC"].append(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_jc))
        data_comm["Total GPU idle time for minimum comm between JZ and JC"].append(convert_seconds_to_days(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_jc) * h100s_per_step))
        data_comm["GPU idle cost during JZ-JC minimum communication latency"].append(convert_to_million_format((calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_jc) / (60 * 60)) * h100s_per_step * H100_COST_PER_HOUR))
        
        data_comm["Total minimum communication latency between JZ and EC"].append(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_ec))
        data_comm["Total GPU idle time for minimum comm between JZ and EC"].append(convert_seconds_to_years(calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_ec) * h100s_per_step))
        data_comm["GPU idle cost during JZ-EC minimum communication latency"].append(convert_to_million_format((calculate_total_minimum_comm_latency_to_train_a_model(model_size, global_batch_size, minimum_latency_between_jz_and_ec) / (60 * 60)) * h100s_per_step * H100_COST_PER_HOUR))
    
    # Convert to DataFrame
    df_comm = pd.DataFrame(data_comm)
    df_comm['Global Batch Size'] = f'{global_batch_size/1e6}M'
    dataframes_comm.append(df_comm)

final_df_comm = pd.DataFrame()
for i, df in enumerate(dataframes_comm):
    final_df_comm = pd.concat([final_df_comm, df])

In [10]:
final_df_comm

Unnamed: 0,Model Size (Params),Dataset Size (Tokens),Total minimum communication latency between JZ and JC,Total GPU idle time for minimum comm between JZ and JC,GPU idle cost during JZ-JC minimum communication latency,Total minimum communication latency between JZ and EC,Total GPU idle time for minimum comm between JZ and EC,GPU idle cost during JZ-EC minimum communication latency,Global Batch Size
0,0.1T,2.0T,1.27827,652.772 days,0.0m,29.790336,41.651 years,0.7m,2.0M
1,0.5T,10.0T,6.391352,3263.858 days,0.2m,148.951681,208.254 years,3.7m,2.0M
2,1.0T,20.0T,12.782705,6527.716 days,0.3m,297.903362,416.508 years,7.3m,2.0M
3,10.0T,200.0T,127.827046,65277.162 days,3.1m,2979.033618,4165.083 years,73.0m,2.0M
4,100.0T,2000.0T,1278.270459,652771.624 days,31.3m,29790.336181,41650.833 years,730.2m,2.0M
0,0.1T,2.0T,0.159784,81.596 days,0.0m,3.723792,5.206 years,0.1m,16.0M
1,0.5T,10.0T,0.798919,407.982 days,0.0m,18.61896,26.032 years,0.5m,16.0M
2,1.0T,20.0T,1.597838,815.965 days,0.0m,37.23792,52.064 years,0.9m,16.0M
3,10.0T,200.0T,15.978381,8159.645 days,0.4m,372.379202,520.635 years,9.1m,16.0M
4,100.0T,2000.0T,159.783807,81596.453 days,3.9m,3723.792023,5206.354 years,91.3m,16.0M
