# Verify the cost model fidelity here
- Note: this code was tested on an A800, so the model was not distributed across devices.
- If you test on a smaller GPU or a machine with lower CPU capability, please check the examples that load in numpy.

In [16]:
import torch
# Candidate values the random test configurations are drawn from:
#   batch size               from [2, 4, 8]
#   prompt length            from [128, 512]
#   maximum generated tokens from [100, 200]
#   precision                from [3, 4, '8:tc-li', 16]
batch_sizes = [2, 4, 8]
prompt_lengths = [128, 512]
generated_tokens = [100, 200]
precision_candidates = [3, 4, '8:tc-li', 16]
# set cuda device
cuda_device_num = 3
# torch.cuda.set_device(cuda_device_num)
# Use the selected GPU index when CUDA is available, otherwise fall back to CPU.
device = torch.device(f"cuda:{cuda_device_num}" if torch.cuda.is_available() else "cpu")

In [2]:
from qllm.models import create_model_config
# Models covered by the check, one [model_name, model_size] pair per entry.
models = [
    ['bloom', '560m'],
    ['bloom', '1b7'],
    ['opt', '13b'],
    ['opt', '30b'],
    ['opt', '66b'],
]
# One config object per entry of `models`, kept in the same order so that a
# single index addresses both lists.
model_configs = [create_model_config(*name_and_size) for name_and_size in models]


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [3]:
import random
from qllm.utils import create_single_node_sharding_strategies_with_precision_specs
def generate_configurations(model_configs):
    """Draw one random test configuration for every model config.

    For each entry of ``model_configs`` the decoder-layer count is read from
    ``n_layer`` (checked first) or ``num_hidden_layers``. A batch size, a
    prompt length, a generated-token count and one precision per decoder
    layer are then picked with ``random.choice`` from the module-level
    candidate lists.

    Returns:
        dict keyed by the position of the config in ``model_configs``; each
        value is ``[batch_size, prompt_length, generated_token, layer_precs]``
        where ``layer_precs`` is the result of
        ``create_single_node_sharding_strategies_with_precision_specs`` for
        the sampled per-layer precisions.

    Raises:
        ValueError: if a config has neither ``n_layer`` nor
            ``num_hidden_layers``.
    """
    sampled = {}
    for position, model_config in enumerate(model_configs):
        # The layer count lives under a different attribute name depending on
        # the config; take the first one that is present.
        for layer_attr in ('n_layer', "num_hidden_layers"):
            if hasattr(model_config, layer_attr):
                num_layers = getattr(model_config, layer_attr)
                break
        else:
            raise ValueError("Not implemented")

        # The draws happen in this fixed order (batch size, prompt length,
        # generated tokens, then one precision per layer).
        entry = [
            random.choice(batch_sizes),
            random.choice(prompt_lengths),
            random.choice(generated_tokens),
        ]
        per_layer_bits = [random.choice(precision_candidates) for _ in range(num_layers)]
        entry.append(
            create_single_node_sharding_strategies_with_precision_specs(num_layers, per_layer_bits)
        )
        sampled[position] = entry

    return sampled


In [6]:
# Sample one random configuration per entry of model_configs; the result is
# keyed by the entry's index.
mem_sample_configs = generate_configurations(model_configs)

In [7]:
# check the generated configs
mem_sample_configs

{0: [4,
  512,
  200,
  {0: {0: {'shard': [0, 1], 'bits': ['8:tc-li', '8:tc-li']},
    1: {'shard': [0, 1], 'bits': [4, 4]},
    2: {'shard': [0, 1], 'bits': [16, 16]},
    3: {'shard': [0, 1], 'bits': [4, 4]},
    4: {'shard': [0, 1], 'bits': ['8:tc-li', '8:tc-li']},
    5: {'shard': [0, 1], 'bits': [4, 4]},
    6: {'shard': [0, 1], 'bits': [4, 4]},
    7: {'shard': [0, 1], 'bits': ['8:tc-li', '8:tc-li']},
    8: {'shard': [0, 1], 'bits': ['8:tc-li', '8:tc-li']},
    9: {'shard': [0, 1], 'bits': [4, 4]},
    10: {'shard': [0, 1], 'bits': [4, 4]},
    11: {'shard': [0, 1], 'bits': [16, 16]},
    12: {'shard': [0, 1], 'bits': [16, 16]},
    13: {'shard': [0, 1], 'bits': [4, 4]},
    14: {'shard': [0, 1], 'bits': [3, 3]},
    15: {'shard': [0, 1], 'bits': ['8:tc-li', '8:tc-li']},
    16: {'shard': [0, 1], 'bits': [3, 3]},
    17: {'shard': [0, 1], 'bits': [16, 16]},
    18: {'shard': [0, 1], 'bits': [3, 3]},
    19: {'shard': [0, 1], 'bits': [3, 3]},
    20: {'shard': [0, 1], 'bits': [4,

In [43]:
# check the memory occupation
from qllm.models import create_empty_model, bare_load_pretrained_from_size
from qllm.utils import get_model_size_cuda, ModelMemEstimator, get_iter_variable_size
import lptorch
def check_mem_occupation(model_name_size_pair, model_config, mem_sample_config):
    """Print measured vs. estimated memory for one model under one sampled config.

    Creates an empty model, shards it with the sampled strategy, moves it to
    the module-level ``device``, initialises the KV cache, and prints the
    measured sizes next to the ``ModelMemEstimator`` values for the pre/post
    part, the sharded model and the KV cache (all in MB).

    Args:
        model_name_size_pair: ``(model_name, model_size)``; ``model_name``
            must be ``'bloom'`` or ``'opt'``.
        model_config: config object of that model. ``hidden_size`` and
            ``vocab_size`` are read for both families; ``ffn_dim``,
            ``max_position_embeddings`` and ``word_embed_proj_dim`` are read
            for ``'opt'`` only.
        mem_sample_config: ``[batch_size, prompt_length, generated_token,
            sharding_strategy]`` as produced by ``generate_configurations``.

    Raises:
        ValueError: if ``model_name`` is neither ``'bloom'`` nor ``'opt'``.
    """
    model_name, model_size = model_name_size_pair
    # Fail early: an unsupported family previously surfaced only later as a
    # NameError on the estimator inputs, after the model had been built.
    if model_name not in ('bloom', 'opt'):
        raise ValueError(
            f"Unsupported model name {model_name!r}; expected 'bloom' or 'opt'"
        )

    # Creating an empty model is enough to estimate sizes; load weights if you want.
    model = create_empty_model(model_name, model_size)
    # model = bare_load_pretrained_from_size(model_name, model_size)
    caliber = lptorch.inner_caliber
    caliber.set_model(model)
    caliber.set_fake()
    caliber.load_fake_calib_data(f'./fake_calib_{model_name}_{model_size}.pkl')

    batch_size, prompt_length, generated_token, sharding_strategy = mem_sample_config
    # shard model
    model_pre_and_post = model._pure_pre_and_post()
    model = model.float()  # some ops need fp32 to perform quantization
    model = model.shard_model(sharding_strategy, 0)  # generated shard strategy is single node

    # Move both parts to the same, explicitly selected `device`. A bare
    # `.cuda()` targets the current default CUDA device instead of `device`
    # and cannot work when `device` fell back to CPU.
    model_pre_and_post = model_pre_and_post.to(device)
    model.decoder_layers_to_device(device)
    # init kv
    model.init_kv_cache(batch_size, prompt_length, generated_token, request_id=1)

    # Collect the estimator inputs; only the family-specific ones differ.
    h1 = model_config.hidden_size
    vocab_size = model_config.vocab_size
    if model_name == 'bloom':
        h2 = h1 * 4
        max_position_embeddings = 0
        word_embed_proj_dim = h1
    else:  # 'opt'
        h2 = model_config.ffn_dim
        max_position_embeddings = model_config.max_position_embeddings
        word_embed_proj_dim = model_config.word_embed_proj_dim

    model_mem_estimator = ModelMemEstimator(
        h1, h2, batch_size, prompt_length, generated_token,
        vocab_size, max_position_embeddings, word_embed_proj_dim,
    )

    # comparison
    # pre/post part (lm_head is measured separately and added on top)
    print("Model pre and post size: ", get_model_size_cuda(model_pre_and_post, 'MB')[0] + get_model_size_cuda(model_pre_and_post.lm_head, 'MB')[0])
    print("Est Model pre and post size: ", model_mem_estimator.calculate_prepost_mem(unit='MB')[1])
    # model
    print("Model size: ", get_model_size_cuda(model, 'MB')[1])
    print("Estimated Model", model_mem_estimator.calculate_model_occupation_of_partition(sharding_strategy[0], unit='MB')[1])
    # KV size
    request_num = 1  # allow more request nums
    # The KV cache is reached through a different attribute path per family.
    if model_name == 'bloom':
        kv_cache_dict = model.transformer.get_all_kv_cache_dict()
    else:  # 'opt'
        kv_cache_dict = model.model.decoder.get_all_kv_cache_dict()
    print("Model KV size: ", get_iter_variable_size(kv_cache_dict, unit='MB'))
    print("Estimated Model KV:", request_num * model_mem_estimator.calculate_kv_occupation_of_partition(sharding_strategy[0], 'MB')[0])
    

In [39]:
def check_mem_est(check_idx):
    """Run ``check_mem_occupation`` for the model at position ``check_idx``.

    ``check_idx`` indexes the module-level ``models``, ``model_configs`` and
    ``mem_sample_configs`` collections, which share the same ordering.
    """
    check_mem_occupation(
        models[check_idx],
        model_configs[check_idx],
        mem_sample_configs[check_idx],
    )

In [21]:
# Generate some fake calibration data
!python3 fake_calib_sample.py --model-name bloom --model-size 560m
!python3 fake_calib_sample.py --model-name bloom --model-size 1b7
!python3 fake_calib_sample.py --model-name opt --model-size 13b
!python3 fake_calib_sample.py --model-name opt --model-size 30b
!python3 fake_calib_sample.py --model-name opt --model-size 66b


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...

Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this inf


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


In [44]:
# Select which entry of `models` to check (0 -> ['bloom', '560m']) and print
# the measured vs. estimated memory for it.
check_idx = 0
check_mem_est(check_idx)

Model pre and post size:  980.0078125
Est Model pre and post size:  980.00390625 MB
Model size:  Total size of the model: 265.64 MB
Estimated Model 264.09375 MB
Model KV size:  267.0
Estimated Model KV: 267.0


In [45]:
# Run the memory check for models[1] (['bloom', '1b7']).
check_idx = 1
check_mem_est(check_idx)

Model pre and post size:  1960.015625
Est Model pre and post size:  1960.0078125 MB
Model size:  Total size of the model: 1089.24 MB
Estimated Model 1086.1875 MB
Model KV size:  492.0
Estimated Model KV: 492.0


In [None]:
# Run the memory check for models[2] (['opt', '13b']).
check_idx = 2
check_mem_est(check_idx)

In [None]:
# Run the memory check for models[3] (['opt', '30b']).
check_idx = 3
check_mem_est(check_idx)