In [6]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
df_eval = pd.read_csv("EVAL_RESULTS_ModelConfig.csv")

In [8]:
def get_flops_hoffman(
    embedding_size,
    hidden_size,
    intermediate_size,
    num_attention_heads,
    num_hidden_layers,
    #num_training_seq=9081600,
    vocab_size=19010,
    seq_len=128,
):
    #
    key_size = hidden_size / num_attention_heads
    
    # Embeddings
    flops_emb = 2 * seq_len * vocab_size * embedding_size
    flops_emb += (2 * seq_len * embedding_size * hidden_size)
    
    # 1 attention block (QKV projection + K@Q logits + Softmax + Softmax query reduction + final linear)
    flops_attention = (2 * 3 * seq_len * hidden_size * (key_size * num_attention_heads))
    flops_attention += (2 * seq_len * seq_len * (key_size * num_attention_heads))
    flops_attention += (3 * num_attention_heads * seq_len * seq_len)
    
    # 1 intermediate layer
    flops_intermediate = (2 * seq_len * (hidden_size * intermediate_size + hidden_size * intermediate_size))
    
    # LM head
    flops_logits = (2 * seq_len * hidden_size * vocab_size)
    
    # Total flops for forward pass
    flops_forward = flops_emb + (num_hidden_layers * (flops_attention + flops_intermediate)) + flops_logits
    
    # As per the assumption in Kaplan and Hoffman
    flops_backward = 2 * flops_forward
    
    return (flops_forward + flops_backward)

def get_flops_kaplan(
    num_non_emb_pars,
    seq_len=128,
    #num_training_seq=9081600,
):
    return (6 * num_non_emb_pars * (seq_len))

In [9]:
df_eval

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,eval/perplexity,eval/loss,eval/step,eval/epoch,eval/batch_idx,eval/updates,test/perplexity,test/loss,Embedding parameters,Non-embedding parameters,Total parameters,run_name,embedding_size,hidden_size,intermediate_size,num_attention_heads,num_hidden_layers
0,0,0,26.517265,3.277796,2000,0,1999,1000,5.600979,1.722941,612608.0,11278914.0,11891522.0,twilight-wildflower-11,32.0,256.0,1024.0,8.0,8.0
1,1,1,24.424155,3.195573,4000,0,3999,2000,5.600979,1.722941,612608.0,11278914.0,11891522.0,twilight-wildflower-11,32.0,256.0,1024.0,8.0,8.0
2,2,2,22.314097,3.105219,6000,0,5999,3000,5.600979,1.722941,612608.0,11278914.0,11891522.0,twilight-wildflower-11,32.0,256.0,1024.0,8.0,8.0
3,3,3,13.612602,2.610996,8000,0,7999,4000,5.600979,1.722941,612608.0,11278914.0,11891522.0,twilight-wildflower-11,32.0,256.0,1024.0,8.0,8.0
4,4,4,10.610475,2.361842,10000,0,9999,5000,5.600979,1.722941,612608.0,11278914.0,11891522.0,twilight-wildflower-11,32.0,256.0,1024.0,8.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571,67,67,5.705392,1.741412,32000,0,31999,32000,5.610025,1.724555,4900864.0,2726210.0,7627074.0,major-smoke-6,256.0,256.0,128.0,8.0,8.0
572,68,68,5.698034,1.740121,33000,0,32999,33000,5.610025,1.724555,4900864.0,2726210.0,7627074.0,major-smoke-6,256.0,256.0,128.0,8.0,8.0
573,69,69,5.660107,1.733443,34000,0,33999,34000,5.610025,1.724555,4900864.0,2726210.0,7627074.0,major-smoke-6,256.0,256.0,128.0,8.0,8.0
574,70,70,5.628158,1.727782,35000,0,34999,35000,5.610025,1.724555,4900864.0,2726210.0,7627074.0,major-smoke-6,256.0,256.0,128.0,8.0,8.0


In [10]:
set_ = set([
    "clean-sky-2",
    "polar-sponge-3",
    "absurd-waterfall-5",
    "genial-voice-1",
    "major-smoke-6",
    "honest-fog-13",
    "prime-pine-15",
])

#
for idx in tqdm(df_eval.index):
    run_name = df_eval.loc[idx, "run_name"]
    
    #
    if run_name in set_:
        num_non_emb_pars = df_eval.loc[idx, "Non-embedding parameters"]
        num_non_emb_pars += (df_eval.loc[idx, "embedding_size"] * df_eval.loc[idx, "hidden_size"])
    else:
        num_non_emb_pars = df_eval.loc[idx, "Non-embedding parameters"]
    
    # Hoffman flops per sequence
    hoffman = get_flops_hoffman(
        embedding_size=df_eval.loc[idx, "embedding_size"],
        hidden_size=df_eval.loc[idx, "hidden_size"],
        intermediate_size=df_eval.loc[idx, "intermediate_size"],
        num_attention_heads=df_eval.loc[idx, "num_attention_heads"],
        num_hidden_layers=df_eval.loc[idx, "num_hidden_layers"],
    )
    
    # Kaplan flops per sequence
    kaplan = get_flops_kaplan(
        num_non_emb_pars,
        seq_len=128,
        #num_training_seq=9081600,
    )
    
    #
    total_hoffman = hoffman * df_eval.loc[idx, "eval/updates"] * 256
    total_kaplan = kaplan * df_eval.loc[idx, "eval/updates"] * 256
    
    #
    df_eval.loc[idx, "FLOPS Hoffman per sequence"] = hoffman
    df_eval.loc[idx, "FLOPS Hoffman total"] = total_hoffman
    df_eval.loc[idx, "FLOPS Kaplan per sequence"] = kaplan
    df_eval.loc[idx, "FLOPS Kaplan total"] = total_kaplan

#
df_eval

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 576/576 [00:00<00:00, 1062.11it/s]


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,eval/perplexity,eval/loss,eval/step,eval/epoch,eval/batch_idx,eval/updates,test/perplexity,test/loss,...,run_name,embedding_size,hidden_size,intermediate_size,num_attention_heads,num_hidden_layers,FLOPS Hoffman per sequence,FLOPS Hoffman total,FLOPS Kaplan per sequence,FLOPS Kaplan total
0,0,0,26.517265,3.277796,2000,0,1999,1000,5.600979,1.722941,...,twilight-wildflower-11,32.0,256.0,1024.0,8.0,8.0,8.850948e+09,2.265843e+15,8.662206e+09,2.217525e+15
1,1,1,24.424155,3.195573,4000,0,3999,2000,5.600979,1.722941,...,twilight-wildflower-11,32.0,256.0,1024.0,8.0,8.0,8.850948e+09,4.531685e+15,8.662206e+09,4.435049e+15
2,2,2,22.314097,3.105219,6000,0,5999,3000,5.600979,1.722941,...,twilight-wildflower-11,32.0,256.0,1024.0,8.0,8.0,8.850948e+09,6.797528e+15,8.662206e+09,6.652574e+15
3,3,3,13.612602,2.610996,8000,0,7999,4000,5.600979,1.722941,...,twilight-wildflower-11,32.0,256.0,1024.0,8.0,8.0,8.850948e+09,9.063371e+15,8.662206e+09,8.870099e+15
4,4,4,10.610475,2.361842,10000,0,9999,5000,5.600979,1.722941,...,twilight-wildflower-11,32.0,256.0,1024.0,8.0,8.0,8.850948e+09,1.132921e+16,8.662206e+09,1.108762e+16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571,67,67,5.705392,1.741412,32000,0,31999,32000,5.610025,1.724555,...,major-smoke-6,256.0,256.0,128.0,8.0,8.0,9.346744e+09,7.656853e+16,2.144061e+09,1.756415e+16
572,68,68,5.698034,1.740121,33000,0,32999,33000,5.610025,1.724555,...,major-smoke-6,256.0,256.0,128.0,8.0,8.0,9.346744e+09,7.896130e+16,2.144061e+09,1.811303e+16
573,69,69,5.660107,1.733443,34000,0,33999,34000,5.610025,1.724555,...,major-smoke-6,256.0,256.0,128.0,8.0,8.0,9.346744e+09,8.135406e+16,2.144061e+09,1.866191e+16
574,70,70,5.628158,1.727782,35000,0,34999,35000,5.610025,1.724555,...,major-smoke-6,256.0,256.0,128.0,8.0,8.0,9.346744e+09,8.374683e+16,2.144061e+09,1.921079e+16


In [13]:
df_eval.loc[:, 'Tokens'] = 256 * 128 * df_eval.loc[:, 'eval/updates']

In [14]:
df_eval.to_csv("EVAL_RESULTS_ModelConfig.csv")