# Load a french dataset

In [None]:
pip install datasets

In [16]:
from datasets import load_dataset

dataset_path = "frenchtext/banque-fr-2311"

In [None]:
dataset = load_dataset(dataset_path)

In [3]:
dataset["valid"][0]["Text"]

"# Les nouvelles normes européennes sur le paiement pourraient affecter l'e-commerce\r\n\r\nicone ecommerce\r\n\r\nLes nouvelles règles européennes sur la sécurisation des paiements en ligne entreront en vigueur à partir du 14 septembre 2019. Elles ont notamment été pensées pour limiter les fraudes dans le domaine. Les banques et les acteurs du secteur interpellent toutefois les autorités sur les perturbations pouvant être induites par le déploiement de cette nouvelle norme.\r\n\r\nLes plateformes d'e-commerce sont généralement débordées en fin d’année, notamment avec Thanksgiving, le Black Friday et les achats de Noël. Cette période s’annonce encore plus compliquée pour 2019.\r\n\r\nEn effet, les nouvelles normes de sécurité pour le paiement en ligne seront appliquées en Europe à compter de mi-septembre. Elles concerneront notamment les banques, les fournisseurs de services de paiement et les e-commerçants.\r\n\r\nAu regard de cette échéance, les protagonistes de ces différents secteu

# Get popular tokenizers

In [None]:
pip install --upgrade transformers

In [None]:
pip install sentencepiece

In [None]:
pip install tiktoken

In [1]:
models = { 
    #"rmkv_world_1b5" : "BlinkDL/rwkv-5-world",

    "phi2_3b" : "microsoft/phi-2",
    "btlm_3b" : "cerebras/btlm-3b-8k-base",
    "redpajama_3b" : "togethercomputer/RedPajama-INCITE-Base-3B-v1",
    "open_llama_3b" : "openlm-research/open_llama_3b_v2",
    "stablelm_3b" : "stabilityai/stablelm-3b-4e1t",      
    
    "yi_6b" : "01-ai/Yi-6B",
    "mistral_7b" : "mistralai/Mistral-7B-v0.1",
    "mpt_7b" : "mosaicml/mpt-7b",
    "falcon_7b" : "tiiuae/falcon-7b",
    "redpajama_7b" : "togethercomputer/RedPajama-INCITE-7B-Base",
    "llama2_7b_32k" : "togethercomputer/LLaMA-2-7B-32K",
    "open_llama_7b" : "openlm-research/open_llama_7b_v2",
    "mpt_7b_8k" : "mosaicml/mpt-7b-8k",
    "qwen_7b" : "Qwen/Qwen-7B",
    "llama2_7b" : "meta-llama/Llama-2-7b-hf",
    "bloomz_7b" : "bigscience/bloomz-7b1-mt",
    "decilm_7b" : "Deci/DeciLM-7B",
    
    "solar_10b" : "upstage/SOLAR-10.7B-v1.0",
    
    "llama2_13b" : "meta-llama/Llama-2-13b-hf",
    "qwen_14b" : "Qwen/Qwen-14B",
    
    "mixtral_8x7B" : "mistralai/Mixtral-8x7B-v0.1",
    
    "mpt_30b" : "mosaicml/mpt-30b",
    "yi_34b" : "01-ai/Yi-34B",
    "falcon_40b" : "tiiuae/falcon-40b"
}

In [3]:
with open("/workspace/hftoken", 'r') as file:
    myhftoken = file.read().strip()

In [31]:
tokenizer = AutoTokenizer.from_pretrained( "Qwen/Qwen-7B", trust_remote_code=True, token=myhftoken)

In [29]:
from transformers import AutoTokenizer
import json

models_tokenizers = {}
models_tokenizers_config = {}
models_tokenizers_specialtokens = {}
for model in models.keys():
    print("------------------------")
    print(f"Loading {model} tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(models[model], trust_remote_code=True, token=myhftoken)
    models_tokenizers[model] = tokenizer
    if model[:5]=="qwen_":
        print(hash(tuple(sorted(tokenizer.mergeable_ranks.items()))))
    else:
        print(hash(tuple(sorted(tokenizer.vocab.items()))))
    config = {}
    specialtokens = {}
    config["type"] = type(tokenizer)
    if model[:5]=="qwen_":
         type(tokenizer.tokenizer)
    else:
        config["backend_type"] = type(tokenizer.backend_tokenizer.model)
    config["vocab_size"] = tokenizer.vocab_size
    config["model_max_length"] = tokenizer.model_max_length
    if hasattr(tokenizer, "special_tokens"): specialtokens["special_tokens"] = tokenizer.special_tokens
    config["padding_side"] = tokenizer.padding_side
    config["truncation_side"] = tokenizer.truncation_side
    config["clean_up_tokenization_spaces"] = tokenizer.clean_up_tokenization_spaces
    if tokenizer.is_fast:
        backend_config = json.loads(tokenizer.backend_tokenizer.to_str())
        if "vocab" in backend_config["model"]: del backend_config["model"]["vocab"]
        if "merges" in backend_config["model"]: del backend_config["model"]["merges"]
        config['truncation'] = backend_config['truncation']
        config['padding'] = backend_config['padding']
        specialtokens['added_tokens'] = backend_config['added_tokens']
        config['normalizer'] = backend_config['normalizer']
        config['pre_tokenizer'] = backend_config['pre_tokenizer']
        config['model'] = backend_config['model']
        config['post_processor'] = backend_config['post_processor']
        config['decoder'] = backend_config['decoder']
    elif model[:3]=="yi_":
        config['model'] = type(tokenizer.sp_model)
    models_tokenizers_config[model] = config
    models_tokenizers_specialtokens[model] = specialtokens
    print(config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


------------------------
Loading phi2_3b tokenizer
4837434508465586422
{'type': <class 'transformers.models.codegen.tokenization_codegen_fast.CodeGenTokenizerFast'>, 'backend_type': <class 'tokenizers.models.BPE'>, 'vocab_size': 50257, 'model_max_length': 2048, 'padding_side': 'right', 'truncation_side': 'right', 'clean_up_tokenization_spaces': True, 'truncation': None, 'padding': None, 'normalizer': None, 'pre_tokenizer': {'type': 'ByteLevel', 'add_prefix_space': False, 'trim_offsets': True, 'use_regex': True}, 'model': {'type': 'BPE', 'dropout': None, 'unk_token': None, 'continuing_subword_prefix': '', 'end_of_word_suffix': '', 'fuse_unk': False, 'byte_fallback': False}, 'post_processor': {'type': 'ByteLevel', 'add_prefix_space': True, 'trim_offsets': False, 'use_regex': True}, 'decoder': {'type': 'ByteLevel', 'add_prefix_space': True, 'trim_offsets': True, 'use_regex': True}}
------------------------
Loading btlm_3b tokenizer
8272226667653940756
{'type': <class 'transformers.models.

# Test tokenizers on french dataset

In [39]:
tokenizer = models_tokenizers["falcon_7b"]

def tokenization(example):
    return tokenizer(example["Text"])

In [40]:
dataset = load_dataset(dataset_path, split="train+valid+test")
dataset = dataset.map(tokenization, batched=True)

Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

In [41]:
words = 0
tokens = 0

for example in dataset:
    words += example['Words']
    tokens += len(example['input_ids'])
    
words, tokens

(67061556, 131722778)

In [43]:
tokenizer.vocab_size, tokens/words

(65024, 1.9642070040844266)

In [17]:
dataset = load_dataset(dataset_path, split="train+valid+test")
    
for model in models:
    tokenizer = models_tokenizers[model]

    def tokenization(example):
        return tokenizer(example["Text"])
    
    dataset = dataset.map(tokenization, batched=True)
    
    words = 0
    tokens = 0
    for example in dataset:
        words += example['Words']
        tokens += len(example['input_ids'])
        
    print("------------------------")
    print(f"{model}: {tokenizer.vocab_size} vocab => {tokens/words} tokens per word")

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3094 > 2048). Running this sequence through the model will result in indexing errors


------------------------
phi2_3b: 50257 vocab => 2.329939242686227 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (10506 > 8192). Running this sequence through the model will result in indexing errors


------------------------
btlm_3b: 50257 vocab => 2.340823750048388 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2765 > 2048). Running this sequence through the model will result in indexing errors


------------------------
redpajama_3b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3326 > 2048). Running this sequence through the model will result in indexing errors


------------------------
open_llama_3b: 32000 vocab => 2.4012566752850173 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
stablelm_3b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4374 > 4096). Running this sequence through the model will result in indexing errors


------------------------
yi_6b: 64000 vocab => 2.4612843758054166 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
mistral_7b: 32000 vocab => 2.234656410298622 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2765 > 2048). Running this sequence through the model will result in indexing errors


------------------------
mpt_7b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3038 > 2048). Running this sequence through the model will result in indexing errors


------------------------
falcon_7b: 65024 vocab => 1.9642070040844266 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2765 > 2048). Running this sequence through the model will result in indexing errors


------------------------
redpajama_7b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (127882 > 32768). Running this sequence through the model will result in indexing errors


------------------------
llama2_7b_32k: 32000 vocab => 2.1736836675844504 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3326 > 2048). Running this sequence through the model will result in indexing errors


------------------------
open_llama_7b: 32000 vocab => 2.4012566752850173 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9937 > 8192). Running this sequence through the model will result in indexing errors


------------------------
mpt_7b_8k: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (112350 > 32768). Running this sequence through the model will result in indexing errors


------------------------
qwen_7b: 151851 vocab => 1.8874039844825552 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
llama2_7b: 32000 vocab => 2.1749545745702648 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
bloomz_7b: 250680 vocab => 1.4450713759161806 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
decilm_7b: 32000 vocab => 2.234656410298622 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
solar_10b: 32000 vocab => 2.234656410298622 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
llama2_13b: 32000 vocab => 2.1749545745702648 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (10460 > 8192). Running this sequence through the model will result in indexing errors


------------------------
qwen_14b: 151851 vocab => 1.8874039844825552 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
mixtral_8x7B: 32000 vocab => 2.234656410298622 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9937 > 8192). Running this sequence through the model will result in indexing errors


------------------------
mpt_30b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4374 > 4096). Running this sequence through the model will result in indexing errors


------------------------
yi_34b: 64000 vocab => 2.4612843758054166 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3038 > 2048). Running this sequence through the model will result in indexing errors


------------------------
falcon_40b: 65024 vocab => 1.9642070040844266 tokens per word


# Train a tokenizer on french dataset

In [10]:
def batch_iterator(batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["Text"]

In [17]:
# Basic byte-level BPE
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
tokenizer = Tokenizer(models.BPE())
# tokenizer.normalizer = None
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=32000,
    min_frequency=100,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
    show_progress=True
)

In [18]:
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))






In [None]:
def tokenization(examples):
    return {'input_ids': [enc.ids for enc in tokenizer.encode_batch(examples["Text"])]}

dataset = dataset.map(tokenization, batched=True)

words = 0
tokens = 0
for example in dataset:
    words += example['Words']
    tokens += len(example['input_ids'])

print("------------------------")
print(f"custom: {tokens/words} tokens per word")

In [38]:
print(f"custom: {tokens/words} tokens per word")

custom: 1.4874704070391687 tokens per word


In [None]:
[token for token in tokenizer.get_vocab().keys() if len(token)>=10]

In [45]:
from collections import Counter

tokens_counts = Counter()

for example in dataset:
    tokens_counts.update(example['input_ids'])

In [47]:
len(tokens_counts)

31877

In [51]:
len(tokens_counts) - len([{tokenizer.decode([token]):tokens_counts[token]} for token in tokens_counts.keys() if tokens_counts[token]<100])

30087

In [52]:
 len([{tokenizer.decode([token]):tokens_counts[token]} for token in tokens_counts.keys() if tokens_counts[token]<100])

1790

# Test model perplexity

In [None]:
pip install --upgrade accelerate bitsandbytes

In [2]:
models = { 
    #"rmkv_world_1b5" : "BlinkDL/rwkv-5-world",

    "btlm_3b" : "cerebras/btlm-3b-8k-base",
    "redpajama_3b" : "togethercomputer/RedPajama-INCITE-Base-3B-v1",
    "open_llama_3b" : "openlm-research/open_llama_3b_v2",
    "stablelm_3b" : "stabilityai/stablelm-3b-4e1t",

    "yi_6b" : "01-ai/Yi-6B",
    "mistral_7b" : "mistralai/Mistral-7B-v0.1",
    "mpt_7b" : "mosaicml/mpt-7b",
    "falcon_7b" : "tiiuae/falcon-7b",
    "redpajama_7b" : "togethercomputer/RedPajama-INCITE-7B-Base",
    "llama2_7b_32k" : "togethercomputer/LLaMA-2-7B-32K",
    "open_llama_7b" : "openlm-research/open_llama_7b_v2",
    "mpt_7b_8k" : "mosaicml/mpt-7b-8k",
    "qwen_7b" : "Qwen/Qwen-7B",
    "llama2_7b" : "meta-llama/Llama-2-7b-hf",
    "bloomz_7b" : "bigscience/bloomz-7b1-mt",

    "llama2_13b" : "meta-llama/Llama-2-13b-hf",
    "qwen_14b" : "Qwen/Qwen-14B",

    "mpt_30b" : "mosaicml/mpt-30b",
    "yi_34b" : "01-ai/Yi-34B",
    "falcon_40b" : "tiiuae/falcon-40b"
}

In [3]:
model_id = models["redpajama_3b"]

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)     

In [4]:
from datasets import load_dataset
dataset_path = "frenchtext/banque-fr-2311"
dataset = load_dataset(dataset_path, split="train+valid+test")

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

In [5]:
text_example = dataset[0]["Text"]
len(text_example)

5375

In [6]:
encodings = tokenizer(text_example, return_tensors="pt")

In [7]:
import torch

with torch.no_grad():
    tokenize_input = tokenizer.encode(text_example[:2048], return_tensors="pt")
    loss = model(tokenize_input, labels=tokenize_input)[0]
torch.exp(loss.float()).item()

2023-11-17 23:50:07.766211: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-17 23:50:07.786800: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


6.375988006591797

In [8]:
for i in range(10):
    print(len(dataset[i]["Text"]))

5375
3689
3804
2104
7936
3802
10669
2171
3568
1862


In [9]:
tokenizer.pad_token = tokenizer.eos_token

In [10]:
encodings = tokenizer(text = dataset[0:10]["Text"], add_special_tokens=True, 
                      padding="longest", truncation=True, return_overflowing_tokens=True, stride=int(tokenizer.model_max_length/2),
                      # 2020: https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#tensor-core-shape
                      # However now in 2023, this is less and less true, newer drivers and cuda versions are smarter about this and will be able to use tensorcores even without this aligned padding
                      pad_to_multiple_of=16)

In [11]:
for sample_mapping,input_ids,attention_mask in zip(encodings["overflow_to_sample_mapping"],encodings["input_ids"],encodings["attention_mask"]):
    print(sample_mapping,len(input_ids),sum(attention_mask))  

0 2048 1630
1 2048 1160
2 2048 1162
3 2048 658
4 2048 2048
4 2048 1741
5 2048 1175
6 2048 2048
6 2048 2048
6 2048 1234
7 2048 648
8 2048 1072
9 2048 585


In [12]:
sorted_dataset = dataset.sort("Words").filter(lambda example: example["Words"]>0)

In [13]:
def get_dataset_batches(batch_size=32):
    dataset_length = len(sorted_dataset)
    for start_idx in range(0, dataset_length, batch_size):
        end_idx = min(start_idx + batch_size, dataset_length)
        yield sorted_dataset[start_idx:end_idx]

In [14]:
def encode_dataset_batch(dataset_batch):
    return tokenizer(text = dataset_batch["Text"], add_special_tokens=True, 
                      padding="longest", truncation=True, return_overflowing_tokens=True, stride=int(tokenizer.model_max_length/2),
                      # 2020: https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#tensor-core-shape
                      # However now in 2023, this is less and less true, newer drivers and cuda versions are smarter about this and will be able to use tensorcores even without this aligned padding
                      pad_to_multiple_of=16,
                      return_tensors="pt")

In [15]:
for idx,dataset_batch in enumerate(get_dataset_batches()):
    encodings = encode_dataset_batch(dataset_batch)
    print(encodings["attention_mask"].sum(axis=1))
    print(encodings["input_ids"].size())
    if idx == 3: break

tensor([ 5,  4,  6,  5,  7, 23,  7,  7,  7,  7,  7, 10,  7,  7, 13,  6,  4,  6,
         6, 15,  4,  4,  4, 12, 12, 12, 12, 12, 12, 12, 12, 12])
torch.Size([32, 32])
tensor([12, 12, 12, 12, 12, 12, 12,  6, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12])
torch.Size([32, 16])
tensor([12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
         6, 12, 12, 12,  7, 12, 12, 12, 12, 12, 12, 12, 12, 47])
torch.Size([32, 48])
tensor([45, 45, 44, 48, 55, 39, 51, 46, 43, 42, 40, 47, 41, 44, 47, 42, 40, 62,
        43, 55, 34, 44, 42, 53, 53, 30, 56, 52, 51, 53, 37, 41])
torch.Size([32, 64])


In [None]:
from tqdm import tqdm

batch_size = 32

nlls = []
for idx,dataset_batch in enumerate(get_dataset_batches(batch_size)):
    
    encodings = encode_dataset_batch(dataset_batch)

    with torch.no_grad():
        outputs = model(encodings["input_ids"], attention_mask=encodings["attention_mask"], labels=encodings["input_ids"])
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)
    
    if idx%10==0: print(f"{(idx+1)*batch_size} / {len(dataset)}: {neg_log_likelihood}")

perplexity = torch.exp(torch.stack(nlls).mean().float())
print(perplexity)

15 min

32 / 85229: 7.71484375
352 / 85229: 7.09765625
672 / 85229: 8.1015625
992 / 85229: 5.12109375
1312 / 85229: 5.78125
1632 / 85229: 5.51953125
1952 / 85229: 6.12109375
2272 / 85229: 3.36328125
2592 / 85229: 5.26171875
2912 / 85229: 5.0703125
3232 / 85229: 4.87109375
3552 / 85229: 4.74609375
3872 / 85229: 4.046875
4192 / 85229: 4.3046875
4512 / 85229: 4.11328125
4832 / 85229: 4.56640625
5152 / 85229: 4.51171875
5472 / 85229: 4.359375
5792 / 85229: 4.32421875
6112 / 85229: 4.68359375
6432 / 85229: 4.6015625
6752 / 85229: 3.658203125
7072 / 85229: 4.7734375
7392 / 85229: 6.6328125
7712 / 85229: 3.869140625
8032 / 85229: 4.9375
8352 / 85229: 4.359375
8672 / 85229: 5.0078125
8992 / 85229: 4.09375
9312 / 85229: 3.513671875
9632 / 85229: 3.578125
9952 / 85229: 3.609375
10272 / 85229: 3.505859375
10592 / 85229: 2.99609375
10912 / 85229: 3.5859375
11232 / 85229: 3.18359375
11552 / 85229: 3.48046875
11872 / 85229: 3.11328125
12192 / 85229: 3.40234375
12512 / 85229: 3.189453125
12832 / 85229: 4.3515625
13152 / 85229: 3.306640625
13472 / 85229: 3.40234375
13792 / 85229: 3.162109375
14112 / 85229: 3.451171875
14432 / 85229: 2.845703125
14752 / 85229: 4.3515625
15072 / 85229: 4.2890625
15392 / 85229: 2.91015625
15712 / 85229: 3.1328125
16032 / 85229: 4.02734375
16352 / 85229: 3.126953125
16672 / 85229: 3.478515625
16992 / 85229: 3.193359375
17312 / 85229: 3.3203125
17632 / 85229: 3.0078125

OOM

In [17]:
perplexity = torch.exp(torch.stack(nlls).mean().float())
print(perplexity)

tensor(64.5841)


# Memory study

## CUDA helper functions

[CUDA semantics / Memory management](https://pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management)

[Understanding CUDA Memory Usage](https://pytorch.org/docs/stable/torch_cuda_memory.html#torch-cuda-memory)

[CUDA memory management API](https://pytorch.org/docs/stable/cuda.html#cuda-memory-management-api)

In [1]:
import torch
from datetime import datetime
from IPython.display import HTML
import pickle

memory_unit = 1024*1024
total_memory = torch.cuda.get_device_properties(0).total_memory

def display_memory():
    print(torch.cuda.get_device_name(0))
    print(f"Total    : {(total_memory/memory_unit):8,.1f} MB")
    print("------------------------------")
    free_memory = torch.cuda.mem_get_info()[0]
    reserved_memory = torch.cuda.memory_reserved(0)
    used_memory = torch.cuda.memory_allocated(0)    
    max_used_memory = torch.cuda.max_memory_allocated(0)
    overhead_memory = total_memory - free_memory - reserved_memory
    print(f"Overhead : {(overhead_memory/memory_unit):8,.1f} MB - {int(overhead_memory/total_memory*100):3} %")
    print(f"Reserved : {(reserved_memory/memory_unit):8,.1f} MB - {int(reserved_memory/total_memory*100):3} %")
    print(f"Free     : {(free_memory/memory_unit):8,.1f} MB - {int(free_memory/total_memory*100):3} %")
    print("------------------------------")
    print(f"Used     : {(used_memory/memory_unit):8,.1f} MB - {int(used_memory/total_memory*100):3} %")
    print(f"Max used : {(max_used_memory/memory_unit):8,.1f} MB - {int(max_used_memory/total_memory*100):3} %")
    
def display_memory_summary():
    print(torch.cuda.memory_summary())
    
def release_cached_memory():
    torch.cuda.empty_cache()
    
def reset_peak_memory_stats():
    torch.cuda.reset_peak_memory_stats()

def record_memory_history(enabled):
    torch.cuda.memory._record_memory_history(enabled=enabled)
    
def dump_memory_snapshot():
    filename_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"memory_snapshot_{filename_datetime}.pickle"
    s = torch.cuda.memory._snapshot(0)
    with open(filename, "wb") as f:
        pickle.dump(s, f)
    print(f"Dumped memory snapshot to file: {filename}")

# https://zdevito.github.io/2022/08/16/memory-snapshots.html
# https://zdevito.github.io/2022/12/09/memory-traces.html

def display_memory_snapshot():
    url = "https://pytorch.org/memory_viz"
    return HTML(f"Call dump_memory_snapshot(), <a href='{url}' target='_blank'>click here to open Pytorch memory viz</a>, then drag and drop the snapshot file")
    
display_memory()

NVIDIA GeForce RTX 4090
Total    : 24,563.5 MB
------------------------------
Overhead :  1,555.5 MB -   6 %
Reserved :      0.0 MB -   0 %
Free     : 23,008.0 MB -  93 %
------------------------------
Used     :      0.0 MB -   0 %
Max used :      0.0 MB -   0 %


In [6]:
record_memory_history(True)

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "togethercomputer/RedPajama-INCITE-Base-3B-v1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)   

In [19]:
dump_memory_snapshot()

Dumped memory snapshot to file: memory_snapshot_20231118_130500.pickle


In [20]:
display_memory_snapshot()

## Pytorch models exploration

In [7]:
from collections import OrderedDict

def find_attribute_origin(obj, attr_name):
    for cls in obj.__class__.__mro__:
        if attr_name in dir(cls):
            return cls.__name__
    return obj.__class__.__name__

def display_members(obj):
    obj_attributes = {}
    for member_name in dir(obj):
        if member_name[0:1]!="_":
            obj_attributes[getattr(obj,member_name).__qualname__ if hasattr(getattr(obj,member_name),"__qualname__") else f"{find_attribute_origin(obj,member_name)}.{member_name}"] = str(type(getattr(obj,member_name)))
    obj_attributes = {k: obj_attributes[k] for k in sorted(obj_attributes)}
    for member_name in obj_attributes.keys():
        print(member_name, obj_attributes[member_name])

In [82]:
display_members(model)

GPTNeoXConfig <class 'type'>
GPTNeoXForCausalLM.T_destination <class 'typing.TypeVar'>
GPTNeoXForCausalLM.base_model <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXModel'>
GPTNeoXForCausalLM.base_model_prefix <class 'str'>
GPTNeoXForCausalLM.call_super_init <class 'bool'>
GPTNeoXForCausalLM.config <class 'transformers.models.gpt_neox.configuration_gpt_neox.GPTNeoXConfig'>
GPTNeoXForCausalLM.device <class 'torch.device'>
GPTNeoXForCausalLM.dtype <class 'torch.dtype'>
GPTNeoXForCausalLM.dummy_inputs <class 'dict'>
GPTNeoXForCausalLM.dump_patches <class 'bool'>
GPTNeoXForCausalLM.embed_out <class 'torch.nn.modules.linear.Linear'>
GPTNeoXForCausalLM.forward <class 'functools.partial'>
GPTNeoXForCausalLM.framework <class 'str'>
GPTNeoXForCausalLM.generation_config <class 'transformers.generation.configuration_utils.GenerationConfig'>
GPTNeoXForCausalLM.get_output_embeddings <class 'method'>
GPTNeoXForCausalLM.gpt_neox <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPT

In [1]:
from torch.nn import ModuleList
import inspect

memory_unit_mb = 1024*1024

def display_modules(module, name_prefix=None, depth=0, max_depth=99, forward_methods=None):
    if forward_methods is None:
        forward_methods = {}
    header = module.__class__.__name__
    if name_prefix is not None:
        header = f"{name_prefix}#{header}" 
    depth_prefix = "  "*depth
    print(depth_prefix+"---------------------")
    print(depth_prefix+header)
    if len(list(module.named_parameters(recurse=False))) > 0:
        print(depth_prefix+"> parameters")
        for name,parameter in module.named_parameters(recurse=False):
            print(depth_prefix+f"- {name}: {get_tensor_description(parameter)}")
    if len(list(module.named_buffers(recurse=False))) > 0:
        print(depth_prefix+"> buffers")
        for name,buffer in module.named_buffers(recurse=False):
            print(depth_prefix+f"- {name}: {get_tensor_description(buffer)}")
    if len(list(module.named_children())) > 0:
        print(depth_prefix+"> submodules")
        for name,submodule in module.named_children():
            print(depth_prefix+f"- {name}: {submodule.__class__.__name__}")
    source_code = inspect.getsource(module.forward)
    forward_methods[module.__class__.__name__] = source_code
    if depth < max_depth:
        for name,submodule in module.named_children():
            if isinstance(submodule, ModuleList):
                display_module_list(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
            else:
                display_modules(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
    if depth==0:
        print()
        print()
        for module_type,source_code in forward_methods.items():
            print("---------------------")
            print(f"{module_type}.forward()")
            print("---------------------")
            print(source_code)
            
def display_module_list(module_list, name_prefix=None, depth=0, max_depth=1, forward_methods=None):
    # ------------------------------
    # Detect repeated layers in ModuleList: code inspired from Pytorch: ModuleList.__repr__    
    list_of_reprs = [repr(item) for item in module_list]
    if len(list_of_reprs) == 0:
        return

    start_end_indices = [[0, 0]]
    repeated_blocks = [list_of_reprs[0]]
    for i, r in enumerate(list_of_reprs[1:], 1):
        if r == repeated_blocks[-1]:
            start_end_indices[-1][1] += 1
            continue

        start_end_indices.append([i, i])
        repeated_blocks.append(r)
    # -------------------------------
    
    depth_prefix = "  "*depth
    print(depth_prefix+"---------------------")
    print(depth_prefix+f"{name_prefix}#ModuleList")
    print(depth_prefix+"> submodules")
    named_submodules = []
    for (start_id, end_id) in start_end_indices:
        submodule = module_list[start_id]
        if start_id != end_id:      
            name = f"{start_id}..{end_id}"
            print(depth_prefix+f"- {name}: {(end_id-start_id+1)}X {submodule.__class__.__name__}")
        else:
            name = str(start_id)
            print(depth_prefix+f"- {name}: {submodule.__class__.__name__}")        
        named_submodules.append((name,submodule))
    if depth < max_depth:
        for name,submodule in named_submodules:
            if isinstance(submodule, ModuleList):
                display_module_list(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
            else:
                display_modules(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)

def get_tensor_description(t):
    dtype = str(t.dtype)[6:]
    dimensions = str(t.size())[11:-1]
    total_byte_size = t.numel() * t.element_size()
    return f"{dtype} {dimensions} ({(total_byte_size/memory_unit_mb):.1f} MB)"

In [195]:
display_modules(model)

---------------------
GPTNeoXForCausalLM
> submodules
- gpt_neox: GPTNeoXModel
- embed_out: Linear
  ---------------------
  gpt_neox#GPTNeoXModel
  > submodules
  - embed_in: Embedding
  - emb_dropout: Dropout
  - layers: ModuleList
  - final_layer_norm: LayerNorm
    ---------------------
    embed_in#Embedding
    > parameters
    - weight: float16 [50432, 2560] (246.2 MB)
    ---------------------
    emb_dropout#Dropout
    ---------------------
    layers#ModuleList
    > submodules
    - 0..31: 32X GPTNeoXLayer
      ---------------------
      0..31#GPTNeoXLayer
      > submodules
      - input_layernorm: LayerNorm
      - post_attention_layernorm: LayerNorm
      - post_attention_dropout: Dropout
      - post_mlp_dropout: Dropout
      - attention: GPTNeoXAttention
      - mlp: GPTNeoXMLP
        ---------------------
        input_layernorm#LayerNorm
        > parameters
        - weight: float16 [2560] (0.0 MB)
        - bias: float16 [2560] (0.0 MB)
        ----------------

# Huggingface language models performance

### Install flash attention

https://pypi.org/project/flash-attn/

See supported combinations of Pytorch / Python / Cuda versions for prebuilt wheels at:

https://github.com/Dao-AILab/flash-attention/releases

Run this commands as root to upgrade Pytorch 2.0 to Pytorch 2.1

> pip install torch torchvision torchaudio

In [None]:
pip install torch torchvision torchaudio

In [None]:
pip install flash-attn --no-build-isolation

In [1]:
import torch
torch.__version__

'2.1.1+cu121'

In [2]:
import flash_attn
flash_attn.__version__

'2.3.6'

In [None]:
pip install psutil

Note: you may need to restart the kernel to use updated packages.


### Tests

In [246]:
from torch.utils import collect_env

collect_env.main()

Collecting environment information...
PyTorch version: 2.1.1+cu121
Is debug build: False
CUDA used to build PyTorch: 12.1
ROCM used to build PyTorch: N/A

OS: Ubuntu 22.04.3 LTS (x86_64)
GCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
Clang version: Could not collect
CMake version: version 3.22.1
Libc version: glibc-2.35

Python version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)
Python platform: Linux-5.15.133.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
Is CUDA available: True
CUDA runtime version: 12.2.128
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration: GPU 0: NVIDIA GeForce RTX 4090
Nvidia driver version: 546.33
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True

CPU:
Architecture:                       x86_64
CPU op-mode(s):                     32-bit, 64-bit
Address sizes:                      46 bits physical, 48 bits virtual
Byte Order:                         Little Endi

In [1]:
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("togethercomputer/RedPajama-INCITE-Base-3B-v1", torch_dtype=torch.float16, device_map="auto")

In [211]:
model.device

device(type='cuda', index=0)

In [26]:
import ast
import inspect
import textwrap

def find_function_calls(func):
    if isinstance(func, types.BuiltinFunctionType):
        raise 
    
    source = inspect.getsource(func)
    source = textwrap.dedent(source)
    tree = ast.parse(source)
    
    function_calls = []

    class FunctionCallVisitor(ast.NodeVisitor):
        def visit_Call(self, node):
            function_calls.append(node)
            #if isinstance(node.func, ast.Name):
            #    function_calls.append(node.func.id)
            self.generic_visit(node)

    FunctionCallVisitor().visit(tree)
    
    function_calls_source = []
    
    for ast_call in function_calls:
        function_calls_source.append(get_function_call_source(ast_call, source))
    
    return function_calls, function_calls_source

def get_function_call_source(ast_call, source):
    function_call_source = ""
    
    for idx,line in enumerate(source.splitlines()):
        if idx+1 == ast_call.lineno:
            if ast_call.end_lineno > ast_call.lineno:
                function_call_source += line[ast_call.col_offset:].strip()
            else:
                function_call_source += line[ast_call.col_offset:ast_call.end_col_offset] 
        elif idx+1 > ast_call.lineno and idx+1 < ast_call.end_lineno:
            function_call_source += " " + line.strip()
        elif ast_call.end_lineno > ast_call.lineno and idx+1 == ast_call.end_lineno:
            function_call_source += " " + line[:ast_call.end_col_offset].strip()
        elif idx+1 > ast_call.end_lineno:
            break
    
    return f"[{ast_call.lineno},{ast_call.col_offset}...{ast_call.end_lineno},{ast_call.end_col_offset}] " + function_call_source

**IMPORTANT**

Fix to enbale CUDA profiling on WSL :

https://github.com/pytorch/pytorch/issues/99615#issuecomment-1827386273

In [219]:
def profile_forward(model, batch_size, seq_length):
    input_ids = torch.randint(low=0, high=32000, size=(batch_size,seq_length), dtype=torch.int64).to(model.device)
    attention_mask = torch.ones(batch_size,seq_length).to(model.device)
    model.eval()
    with torch.profiler.profile(use_cuda=True, record_shapes=True, profile_memory=True, with_stack=True, with_flops=True, with_modules=True, experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True)) as prof:
        with torch.profiler.record_function("MODEL INFERENCE"):
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)        

    return prof

In [232]:
prof = profile_forward(model, 100, 1024)

STAGE:2023-12-16 18:05:11 118:118 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2023-12-16 18:05:34 118:118 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2023-12-16 18:05:34 118:118 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [233]:
events = prof.events()

In [234]:
def print_profiler_event(event, show_code_lines=0):
    csindex = 0
    call_site = event.stack[csindex].split(": ")[0]
    while call_site.startswith("<built-in") or call_site.startswith("torch/"):
        csindex += 1
        call_site = event.stack[csindex].split(": ")[0]
        
    if show_code_lines>0:
        base_dir = "/workspace/wordslab-llms/.venv/lib/python3.10/site-packages/"
        relative_path = call_site.split("(")[0]
        line_number = int(call_site.split("(")[1][:-1])
        try:
            with open(base_dir+relative_path, 'r', encoding='utf-8') as file:
                file_content = file.read()
                call_line = ""
                for idx,line in enumerate(file_content.splitlines()):
                    if idx>=line_number and idx<line_number+show_code_lines:
                        if idx==line_number:
                            call_line += ">>> "
                        else:
                            call_line += f"{idx} "
                        call_line += f"{line}\n"
        except:
            call_line = ""
    
    filtered_stack = []
    for frame in event.stack:
        if frame.endswith("profile_forward"):
            break
        elif not frame.startswith("<built-in") and not frame.startswith("torch/"):
            function = frame.split(": ")[1]
            if not frame.startswith("nn.Module") and not function=="forward":
                function = function
            filtered_stack.append(function)
    filtered_stack.reverse()
    call_stack = ".".join(filtered_stack)

    filtered_inputs = []
    for input_shape in event.input_shapes:
        if len(input_shape)>0:
            filtered_inputs.append(input_shape)
    
    print(f"- call stack: {call_stack}")
    print(f"- operation : {event.name}")
    print(f"- inputs    : {filtered_inputs}")
    print(f"- cpu time  : {event.cpu_time}")
    print(f"- cpu memory: {event.cpu_memory_usage}")
    print(f"- gpu time  : {event.cuda_time}")
    print(f"- gpu memory: {event.cuda_memory_usage}")
    print(f"- flops     : {event.flops}")    
    if show_code_lines>0:
        print(f"- func src  : {call_site}")
        print(call_line)

In [223]:
events[38]

<FunctionEvent id=33831 name=aten::linear device_type=DeviceType.CPU node_id=-1 cpu_time=59.000us start_us=1147 end_us=1206 cpu_children=[33832, 33833, 33834, 33837] cuda_time=0.000us name=aten::linear thread=1 input_shapes=[[4, 1024, 2560], [7680, 2560], [7680]] cpu_memory_usage=0 cuda_memory_usage=62914560 is_async=False is_remote=False seq_nr=-1 is_legacy=False>

In [237]:
for idx,event in enumerate(events[1:10000]):
    #if event.cpu_parent.id==events[0].id and (event.cpu_time+event.cuda_time)>=30:
    if event.cuda_time>0:
        print(idx+1)
        print_profiler_event(event)

In [228]:
event0 = events[4290]
[(attr,getattr(event0,attr)) for attr in dir(event0) if not attr.startswith("__") and not attr=="cpu_children" and not attr=="cpu_parent" and not callable(getattr(event0, attr))]

[('concrete_inputs',
  [None, [4, 32, 1024, 1024], [33554432, 1048576, 1024, 1], None]),
 ('count', 1),
 ('cpu_memory_usage', 0),
 ('cpu_time', 0.0),
 ('cpu_time_str', '0.000us'),
 ('cpu_time_total', 0),
 ('cpu_time_total_str', '0.000us'),
 ('cuda_memory_usage', 0),
 ('cuda_time', 0.0),
 ('cuda_time_str', '0.000us'),
 ('cuda_time_total', 0),
 ('cuda_time_total_str', '0.000us'),
 ('device_index', 118),
 ('device_type', <DeviceType.CPU: 0>),
 ('flops', 0),
 ('fwd_thread', 0),
 ('id', 38083),
 ('input_shapes', [[4, 32, 1024, 1024], [], [], []]),
 ('is_async', False),
 ('is_legacy', False),
 ('is_remote', False),
 ('kernels', []),
 ('key', 'aten::as_strided'),
 ('name', 'aten::as_strided'),
 ('node_id', -1),
 ('privateuse1_memory_usage', 0),
 ('privateuse1_time', 0.0),
 ('privateuse1_time_str', '0.000us'),
 ('privateuse1_time_total', 0),
 ('privateuse1_time_total_str', '0.000us'),
 ('scope', 0),
 ('self_cpu_memory_usage', 0),
 ('self_cpu_time_total', 0),
 ('self_cpu_time_total_str', '0.000

In [89]:
import inspect
import jedi

classobj = torch.nn.modules.Linear
script = jedi.Script(path=inspect.getfile(classobj))
script

<Script: '/usr/lib/python3/dist-packages/torch/nn/modules/linear.py' <SameEnvironment: 3.10.12 in /workspace/wordslab-llms/.venv>>

In [107]:
myfunctioncall = None

for name in script.get_names(all_scopes=True, definitions=True, references=True):
    if name.full_name == "torch.nn.modules.linear.Linear.forward.linear":
        myfunctioncall = name
        break

myfunctioncall

<Name full_name='torch.nn.modules.linear.Linear.forward.linear', description='linear'>

In [128]:
myfunctioncall.goto()



In [108]:
[attr for attr in dir(myfunctioncall) if not callable(getattr(myfunctioncall, attr)) and not attr.startswith("__")], [attr for attr in dir(myfunctioncall) if callable(getattr(myfunctioncall, attr)) and not attr.startswith("_")]

(['_inference_state',
  '_mapping',
  '_name',
  '_tuple_mapping',
  'column',
  'description',
  'full_name',
  'is_keyword',
  'line',
  'module_name',
  'module_path',
  'name',
  'type'],
 ['defined_names',
  'docstring',
  'execute',
  'get_definition_end_position',
  'get_definition_start_position',
  'get_line_code',
  'get_signatures',
  'get_type_hint',
  'goto',
  'in_builtin_module',
  'infer',
  'is_definition',
  'is_side_effect',
  'is_stub',
  'parent'])

In [137]:
import types

called_function = eval(script.search("F.linear")[0].full_name)
called_function, isinstance(called_function, types.BuiltinFunctionType)

(<function torch._C._nn.linear>, True)

In [76]:
# https://jedi.readthedocs.io/en/latest/docs/api-classes.html#name

myfunction = None

for function in script.get_names(all_scopes=True):
    if function.name == "forward" and function.type == 'function':
        myfunction = function
        break
        
[attr for attr in dir(myfunction) if not callable(getattr(myfunction, attr)) and not attr.startswith("__")], [attr for attr in dir(myfunction) if callable(getattr(myfunction, attr)) and not attr.startswith("_")]

(['_inference_state',
  '_mapping',
  '_name',
  '_tuple_mapping',
  'column',
  'description',
  'full_name',
  'is_keyword',
  'line',
  'module_name',
  'module_path',
  'name',
  'type'],
 ['defined_names',
  'docstring',
  'execute',
  'get_definition_end_position',
  'get_definition_start_position',
  'get_line_code',
  'get_signatures',
  'get_type_hint',
  'goto',
  'in_builtin_module',
  'infer',
  'is_definition',
  'is_side_effect',
  'is_stub',
  'parent'])

In [88]:
myfunction.

[<Name name='self', description='param self'>,
 <Name name='input', description='param input: Tensor'>]

In [79]:
ast_call = called_functions[0]
ast_call

<ast.Call at 0x7f23381ad4e0>

In [58]:
source = inspect.getsource(torch.nn.modules.Linear.forward)
source = textwrap.dedent(source)
function_called = ""
for idx,line in enumerate(source.splitlines()):
    if idx+1 == ast_func.lineno:
        if ast_func.end_lineno > ast_func.lineno:
            function_called += line[ast_func.col_offset:].strip()
        else:
            function_called += line[ast_func.col_offset:ast_func.end_col_offset] 
    elif idx+1 > ast_func.lineno and idx+1 < ast_func.end_lineno:
        function_called += " " + line.strip()
    elif ast_func.end_lineno > ast_func.lineno and idx+1 == ast_func.end_lineno:
        function_called += " " + line[:ast_func.end_col_offset].strip()
    elif idx+1 > ast_func.end_lineno:
        break
        
function_called

'F.linear'

In [17]:
ast_func = ast_call.func
ast_args = ast_call.args
ast_keywords = ast_call.keywords
(ast_func,ast_args,ast_keywords)

(<ast.Attribute at 0x7f23381ad5d0>,
 [<ast.Name at 0x7f23381aec20>,
  <ast.Attribute at 0x7f23381ae4d0>,
  <ast.Attribute at 0x7f23381ae290>],
 [])

In [29]:
ast_func.value.id, ast_func.attr

('F', 'linear')

In [45]:
ast_args[0].id,  ast_args[1].value.id, ast_args[1].attr, ast_args[2].value.id, ast_args[2].attr

('input', 'self', 'weight', 'self', 'bias')

In [46]:
ast_keywords

[]

In [51]:
obj = ast_func
[attr for attr in dir(obj) if not callable(getattr(obj, attr)) and not attr.startswith("__")]

['_attributes',
 '_fields',
 'attr',
 'col_offset',
 'ctx',
 'end_col_offset',
 'end_lineno',
 'lineno',
 'value']

### Ipyexperiments

https://github.com/stas00/ipyexperiments/blob/master/README.md

> pip install ipyexperiments

Usage:

```python
from ipyexperiments import IPyExperimentsPytorch
exp1 = IPyExperimentsPytorch()
...
exp1.keep_var_names('var1', 'var2')

# optional
data = exp1.finish()
cpu_data_final = data.cpu
gpu_data_final = data.gpu

del exp1
```

Detailed syntax:

```python
exp = IPyExperimentsPytorch(cl_enable=True, cl_compact=False, cl_gc_collect=True, cl_set_seed=0)
```

- cl_enable - enable the subsystem
- cl_compact - use compact one line printouts
- cl_gc_collect - get correct memory usage reports. Don't use when tracking memory leaks (objects with circular reference).
- cl_set_seed - set RNG seed before each cell is run to the provided seed value

In [1]:
pip install ipyexperiments

Note: you may need to restart the kernel to use updated packages.


In [1]:
from torch.nn import ModuleList
import inspect

memory_unit_mb = 1024*1024

def display_modules(module, name_prefix=None, depth=0, max_depth=99, forward_methods=None):
    if forward_methods is None:
        forward_methods = {}
    header = module.__class__.__name__
    if name_prefix is not None:
        header = f"{name_prefix}#{header}" 
    depth_prefix = "  "*depth
    print(depth_prefix+"---------------------")
    print(depth_prefix+header)
    if len(list(module.named_parameters(recurse=False))) > 0:
        print(depth_prefix+"> parameters")
        for name,parameter in module.named_parameters(recurse=False):
            print(depth_prefix+f"- {name}: {get_tensor_description(parameter)}")
    if len(list(module.named_buffers(recurse=False))) > 0:
        print(depth_prefix+"> buffers")
        for name,buffer in module.named_buffers(recurse=False):
            print(depth_prefix+f"- {name}: {get_tensor_description(buffer)}")
    if len(list(module.named_children())) > 0:
        print(depth_prefix+"> submodules")
        for name,submodule in module.named_children():
            print(depth_prefix+f"- {name}: {submodule.__class__.__name__}")
    source_code = inspect.getsource(module.forward)
    forward_methods[module.__class__.__name__] = source_code
    if depth < max_depth:
        for name,submodule in module.named_children():
            if isinstance(submodule, ModuleList):
                display_module_list(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
            else:
                display_modules(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
    if depth==0:
        print()
        print()
        for module_type,source_code in forward_methods.items():
            print("---------------------")
            print(f"{module_type}.forward()")
            print("---------------------")
            print(source_code)
            
def display_module_list(module_list, name_prefix=None, depth=0, max_depth=1, forward_methods=None):
    # ------------------------------
    # Detect repeated layers in ModuleList: code inspired from Pytorch: ModuleList.__repr__    
    list_of_reprs = [repr(item) for item in module_list]
    if len(list_of_reprs) == 0:
        return

    start_end_indices = [[0, 0]]
    repeated_blocks = [list_of_reprs[0]]
    for i, r in enumerate(list_of_reprs[1:], 1):
        if r == repeated_blocks[-1]:
            start_end_indices[-1][1] += 1
            continue

        start_end_indices.append([i, i])
        repeated_blocks.append(r)
    # -------------------------------
    
    depth_prefix = "  "*depth
    print(depth_prefix+"---------------------")
    print(depth_prefix+f"{name_prefix}#ModuleList")
    print(depth_prefix+"> submodules")
    named_submodules = []
    for (start_id, end_id) in start_end_indices:
        submodule = module_list[start_id]
        if start_id != end_id:      
            name = f"{start_id}..{end_id}"
            print(depth_prefix+f"- {name}: {(end_id-start_id+1)}X {submodule.__class__.__name__}")
        else:
            name = str(start_id)
            print(depth_prefix+f"- {name}: {submodule.__class__.__name__}")        
        named_submodules.append((name,submodule))
    if depth < max_depth:
        for name,submodule in named_submodules:
            if isinstance(submodule, ModuleList):
                display_module_list(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
            else:
                display_modules(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)

def get_tensor_description(t):
    dtype = str(t.dtype)[6:]
    dimensions = str(t.size())[11:-1]
    total_byte_size = t.numel() * t.element_size()
    return f"{dtype} {dimensions} ({(total_byte_size/memory_unit_mb):.1f} MB)"

In [2]:
import os

# https://pytorch.org/docs/stable/notes/cuda.html#environment-variables
# https://zdevito.github.io/2022/08/04/cuda-caching-allocator.html
# Need to test the line below with Pytorch 2.1
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "roundup_power2_divisions:4"

import psutil
import torch
from transformers.utils.hub import cached_file

from datetime import datetime
from IPython.display import HTML
import pickle

def get_model_path_and_size_on_disk(model):    
    model_config_file = cached_file(model.name_or_path, "config.json", local_files_only=True)
    model_directory = os.path.dirname(model_config_file)
    
    total_size = 0
    for entry in os.listdir(model_directory):
        full_entry_path = os.path.join(model_directory, entry)
        if os.path.isfile(full_entry_path):
            total_size += os.path.getsize(full_entry_path)
    return model_directory,total_size

def get_used_cpu_memory():
    process = psutil.Process(os.getpid())
    process_memory = process.memory_info().rss
    return process_memory

def get_used_and_max_gpu_memory():
    used_memory = torch.cuda.memory_allocated(0)    
    max_used_memory = torch.cuda.max_memory_allocated(0)
    return used_memory,max_used_memory

def reset_max_gpu_memory():
    torch.cuda.reset_peak_memory_stats()
    
def release_cached_memory():
    torch.cuda.empty_cache()
    
def record_memory_history(enabled):
    torch.cuda.memory._record_memory_history(enabled=enabled)
    
def dump_memory_snapshot():
    filename_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"memory_snapshot_{filename_datetime}.pickle"
    s = torch.cuda.memory._snapshot(0)
    with open(filename, "wb") as f:
        pickle.dump(s, f)
    print(f"Dumped memory snapshot to file: {filename}")

# https://zdevito.github.io/2022/08/16/memory-snapshots.html
# https://zdevito.github.io/2022/12/09/memory-traces.html

def display_memory_snapshot():
    url = "https://pytorch.org/memory_viz"
    return HTML(f"Call dump_memory_snapshot(), <a href='{url}' target='_blank'>click here to open Pytorch memory viz</a>, then drag and drop the snapshot file")

In [3]:
from time import perf_counter_ns
from transformers import AutoModelForCausalLM, AutoTokenizer

memory_unit_mb = 1024*1024
memory_unit_gb = 1024*1024*1024

time_unit_µs = 1000
time_unit_ms = 1000*1000
time_unit_s = 1000*1000*1000

def get_tensor_params_size_and_dim(param):
    if param is None:
        return 0,""
    elif isinstance(param, torch.Tensor):
        psize = param.numel() * param.element_size()
        pdim = f"{str(param.dtype)[6:]}{str(param.size())[11:-1]}"
        return psize,pdim
    elif isinstance(param, dict):
        size = 0
        dim = ""
        for value in param.values():            
            psize, pdim = get_tensor_params_size_and_dim(value)
            size += psize
            dim += pdim
        return size, dim
    else:
        try:
            iter(param)
            size = 0
            dim = ""
            for value in param:            
                psize, pdim = get_tensor_params_size_and_dim(value)
                size += psize
                dim += pdim
            return size, dim
        except TypeError:
            return 0,""

class ModulePerf:
    
    def __init__(self, module_name, module, is_leaf_module):
        self.module_name = module_name
        self.module = module
        self.is_leaf_module = is_leaf_module
        
        self.before_forward_time_ns = 0
        self.before_forward_used_memory = 0
        self.forward_inputs_memory_size = 0 
        self.forward_inputs_memory_dim = "" 
        
        self.after_forward_time_ns = 0
        self.after_forward_used_memory = 0
        self.forward_max_used_memory = 0        
        self.forward_outputs_memory_size = 0
        self.forward_outputs_memory_dim = ""
        
        self.before_backward_time_ns = 0
        self.before_backward_used_memory = 0
        self.backward_inputs_memory_size = 0
        self.backward_inputs_memory_dim = "" 
        
        self.after_backward_time_ns = 0
        self.after_backward_used_memory = 0
        self.backward_max_used_memory = 0
        self.backward_outputs_memory_size = 0
        self.backward_outputs_memory_dim = ""
        
    def before_forward(self, module, args, kwargs):
        self.before_forward_time_ns = perf_counter_ns()
        self.before_forward_used_memory,_ = get_used_and_max_gpu_memory()   
        args_size,args_dim = get_tensor_params_size_and_dim(args)
        kwargs_size,kwargs_dim = get_tensor_params_size_and_dim(kwargs) 
        self.forward_inputs_memory_size = args_size + kwargs_size
        self.forward_inputs_memory_dim = args_dim + kwargs_dim
        if self.is_leaf_module: reset_max_gpu_memory()
        
    def after_forward(self, module, args, kwargs, output):
        self.after_forward_time_ns = perf_counter_ns()
        self.after_forward_used_memory, self.forward_max_used_memory = get_used_and_max_gpu_memory()
        self.forward_outputs_memory_size, self.forward_outputs_memory_dim = get_tensor_params_size_and_dim(output) 
        
    def before_backward(self, module, grad_output):
        self.before_backward_time_ns = perf_counter_ns()
        self.before_backward_used_memory,_ = get_used_and_max_gpu_memory()
        self.backward_inputs_memory_size, self.backward_inputs_memory_dim = get_tensor_params_size_and_dim(grad_output) 
        
    def after_backward(self, module, grad_input, grad_output):
        self.after_backward_time_ns = perf_counter_ns()
        self.after_backward_used_memory, self.backward_max_used_memory = get_used_and_max_gpu_memory()
        self.backward_outputs_memory_size, self.backward_outputs_memory_dim = get_tensor_params_size_and_dim(grad_input) 
    
    def get_stats_line(self, initial_used_memory):
        return f"{self.module_name};{self.is_leaf_module};;{(self.after_forward_time_ns-self.before_forward_time_ns)/time_unit_µs:.1f};;{self.forward_inputs_memory_dim};{self.forward_inputs_memory_size/memory_unit_mb:.1f};{(self.before_forward_used_memory-initial_used_memory)/memory_unit_mb:.1f};{(self.forward_max_used_memory-initial_used_memory)/memory_unit_mb:.1f};{(self.after_forward_used_memory-initial_used_memory)/memory_unit_mb:.1f};{self.forward_outputs_memory_dim};{self.forward_outputs_memory_size/memory_unit_mb:.1f};;{(self.after_backward_time_ns-self.before_backward_time_ns)/time_unit_µs:.1f};;{self.backward_inputs_memory_dim};{(self.backward_inputs_memory_size-initial_used_memory)/memory_unit_mb:.1f};{(self.before_backward_used_memory-initial_used_memory)/memory_unit_mb:.1f};{(self.backward_max_used_memory-initial_used_memory)/memory_unit_mb:.1f};{self.after_backward_used_memory/memory_unit_mb:.1f};{self.backward_outputs_memory_dim};{self.backward_outputs_memory_size/memory_unit_mb:.1f}"
    
class ModelForCausalLMBenchmark:   
    
    @staticmethod
    def download_in_local_cache(pretrained_model_id, **kwargs):
        print(f"Loading model {pretrained_model_id} in local cache ...")
        AutoTokenizer.from_pretrained(pretrained_model_id, **kwargs)
        model = AutoModelForCausalLM.from_pretrained(pretrained_model_id, **kwargs)
        path,size = get_model_path_and_size_on_disk(model)
        print(f"--> model files size   : {(size/memory_unit_gb):.2f} GB")
        print(f"--> stored in directory: {path}")
        print()
    
    def __init__(self, pretrained_model_id):
        self.pretrained_model_id = pretrained_model_id
        self.tokenizer = None 
        self.model = None
        
        self.model_path = None
        self.model_size_on_disk = 0
        self.tokenizer_load_time_ns = 0
        self.tokenizer_cpu_memory = 0
        self.model_load_time_ns = 0
        self.model_cpu_memory = 0
        self.model_gpu_memory = 0
        self.model_load_max_gpu_memory = 0
        
    def trace_load_from_cache(self, **kwargs):
        cpu_memory_before = get_used_cpu_memory()
        gpu_memory_before = get_used_and_max_gpu_memory()[0]
        reset_max_gpu_memory()        
        time_before = perf_counter_ns()
        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_id, **kwargs)
        cpu_memory_tokenizer = get_used_cpu_memory()
        time_tokenizer = perf_counter_ns()
        self.model = AutoModelForCausalLM.from_pretrained(self.pretrained_model_id, **kwargs)
        cpu_memory_model = get_used_cpu_memory()
        gpu_memory_model,max_gpu_memory_model = get_used_and_max_gpu_memory()     
        time_model = perf_counter_ns()
        
        self.model_path,self.model_size_on_disk = get_model_path_and_size_on_disk(self.model)
        self.tokenizer_load_time_ns = time_tokenizer-time_before
        self.tokenizer_cpu_memory = cpu_memory_tokenizer-cpu_memory_before
        self.model_load_time_ns = time_model-time_tokenizer
        self.model_cpu_memory = cpu_memory_model-cpu_memory_tokenizer
        self.model_gpu_memory = gpu_memory_model-gpu_memory_before
        self.model_load_max_gpu_memory = max_gpu_memory_model
        
        self.display_load_results()            
    
    def display_load_results(self):
        print(f"Model files: {(self.model_size_on_disk/1024/1024/1024):.2f} GB on disk")
        print(""f"(cache path: {self.model_path})")
        print()
        print(f"Tokenizer load time : {(self.tokenizer_load_time_ns/time_unit_ms):.2f} ms")
        print(f"Tokenizer CPU memory: {(self.tokenizer_cpu_memory/memory_unit_mb):.2f} MB")
        print()
        print(f"Model load time : {(self.model_load_time_ns/time_unit_ms):.2f} ms")
        print(f"Model CPU memory: {(self.model_cpu_memory/memory_unit_gb):.2f} GB")
        print(f"Model GPU memory: {(self.model_gpu_memory/memory_unit_gb):.2f} GB")
        print(f"Max   GPU memory: {(self.model_load_max_gpu_memory/memory_unit_gb):.2f} GB")
        print()
        
    def trace_prefill(self, batch_size, seq_length):
        # warmup
        input_ids = torch.randint(low=0, high=self.tokenizer.vocab_size, size=(batch_size,seq_length), dtype=torch.int64).to(self.model.device)
        attention_mask = torch.ones(batch_size,seq_length).to(self.model.device)
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)        
        
        # measure perfs
        moduleperfs = []
        hookhandles = []
        try:
            for module_name,module in self.model.named_modules():
                if module_name=="": module_name="<model>"
                mperf = ModulePerf(module_name, module, len(list(module.children())) == 0)
                moduleperfs.append(mperf)                
                hookhandles.append(module.register_forward_pre_hook(mperf.before_forward, with_kwargs=True))
                hookhandles.append(module.register_forward_hook(mperf.after_forward, with_kwargs=True))
                hookhandles.append(module.register_full_backward_pre_hook(mperf.before_backward))
                hookhandles.append(module.register_full_backward_hook(mperf.after_backward))
            
            # perf test
            input_ids = torch.randint(low=0, high=self.tokenizer.vocab_size, size=(batch_size,seq_length), dtype=torch.int64).to(self.model.device)
            attention_mask = torch.ones(batch_size,seq_length).to(self.model.device)
            self.model.eval()
            with torch.no_grad():
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)
        finally:
            for handle in hookhandles:
                handle.remove()    
                
        # sort modules
        sorted_moduleperfs = sorted(moduleperfs, key=lambda mp: mp.after_forward_time_ns)
        first_mperf = None
        for mperf in sorted_moduleperfs:
            if mperf.before_forward_used_memory>0:
                first_mperf = mperf
                break
        initial_used_memory = first_mperf.before_forward_used_memory - first_mperf.forward_inputs_memory_size
        
        # display results
        print(f"Prefill test for batch size {batch_size} and sequence length {seq_length}:")
        for mperf in sorted_moduleperfs:
            if mperf.after_forward_time_ns>0:
                print(mperf.get_stats_line(initial_used_memory))
    
    def check_prefill(self, max_batch_size):
        seq_length = self.tokenizer.model_max_length
        batch_size = 1
        
        # warmup
        input_ids = torch.randint(low=0, high=self.tokenizer.vocab_size, size=(batch_size,seq_length), dtype=torch.int64).to("cuda")
        attention_mask = torch.ones(batch_size,seq_length).to("cuda")
        self.model.eval()
        with torch.no_grad():
            self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)        

        # perf test
        base_gpu_memory,_ = get_used_and_max_gpu_memory()
        seq_length = 128
        while seq_length <= self.tokenizer.model_max_length:
            for batch_size in range(1,max_batch_size+1):
                #print(f"--- {batch_size} x {seq_length} ---")
                reset_max_gpu_memory()
                initial_gpu_memory,_ = get_used_and_max_gpu_memory()
                input_ids = torch.randint(low=0, high=self.tokenizer.vocab_size, size=(batch_size,seq_length), dtype=torch.int64).to(self.model.device)
                attention_mask = torch.ones(batch_size,seq_length).to(self.model.device)
                before_forward_time_ns = perf_counter_ns()
                with torch.no_grad():
                    self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)        
                after_forward_time_ns = perf_counter_ns()
                before_release = perf_counter_ns()
                # https://pytorch.org/docs/stable/notes/cuda.html#environment-variables
                # see "expandable_segments": Pytorch allocator doesn't work when we gradually increase batch size !
                # when inferencing with a constant batch size, this should not be needed
                release_cached_memory()
                after_release = perf_counter_ns()            
                gpu_memory, max_gpu_memory = get_used_and_max_gpu_memory()
                #print(f"Forward pass  : {(after_forward_time_ns-before_forward_time_ns)/time_unit_ms:.1f} ms")
                #print(f"Initial memory  : {((initial_gpu_memory-base_gpu_memory)/memory_unit_gb):.2f} GB")
                #print(f"Maximum memory: {((max_gpu_memory-initial_gpu_memory)/memory_unit_gb):.2f} GB")
                #print(f"Final memory  : {((gpu_memory-base_gpu_memory)/memory_unit_gb):.2f} GB")
                #print(f"+ GPU cache release  : {(after_release-before_release)/time_unit_ms:.1f} ms")
                print(f"{batch_size},{seq_length},{batch_size*seq_length/(after_forward_time_ns-before_forward_time_ns)*time_unit_s:.2f},{(after_forward_time_ns-before_forward_time_ns)/time_unit_ms:.2f},{(initial_gpu_memory/memory_unit_gb):.2f},{((max_gpu_memory-initial_gpu_memory)/memory_unit_gb):.2f},{(max_gpu_memory/memory_unit_gb):.2f}")
            seq_length *= 2
    
    def trace_generate(self):
        return
    
    def trace_train(self):
        return

In [4]:
from ipyexperiments import IPyExperimentsPytorch

# Redpajama-3B

In [7]:
with IPyExperimentsPytorch(cl_enable=False):
    model_id = "togethercomputer/RedPajama-INCITE-Base-3B-v1"
    ModelForCausalLMBenchmark.download_in_local_cache(model_id)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     187  14,273  15,837 MB   1.18% 
GPU:   1,227  23,336  24,564 MB   5.00% 


Loading model togethercomputer/RedPajama-INCITE-Base-3B-v1 in local cache ...
--> model files size   : 5.30 GB
--> stored in directory: /models/huggingface/transformers/models--togethercomputer--RedPajama-INCITE-Base-3B-v1/snapshots/094fbdd0c911feb485ce55de1952ab2e75277e1e


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:00:19 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_id

*** Circular ref objects gc collected during the experiment:
cleared 93 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:      993     -111 MB (-11.25%)
GPU:        0        0 MB (100.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:   1,293  13,175  15,837 MB   8.17%

## Analysis

In [None]:
exp = IPyExperimentsPytorch(cl_enable=False)

In [None]:
model_id = "togethercomputer/RedPajama-INCITE-Base-3B-v1"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype=torch.float16)

In [None]:
model_benchmark.trace_prefill(20, 2048)

For MPT-3B, the line of code which triggers the maximum memory is this one:

transformers/models/gpt_neox/modeling_gpt_neox.py
```
def _attn(self, query, key, value, attention_mask=None, head_mask=None):
...
    attn_weights = nn.functional.softmax(attn_scores, dim=-1)
```
 
attn_weights is a very large matrix of size: float16[20, 32, 2048, 2048] => 5120 MB of memory.

On this line we need to allocate it twice.
An inplace softmax would divide the memory requirements by a factor of 2 !

https://lernapparat.de/pytorch-inplace

In [None]:
del exp

## 16 bits

In [11]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:   1,303  13,105  15,837 MB   8.23% 
GPU:   1,227  23,336  24,564 MB   5.00% 




In [12]:
model_id = "togethercomputer/RedPajama-INCITE-Base-3B-v1"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype=torch.float16)

Model files: 5.30 GB on disk
(cache path: /models/huggingface/transformers/models--togethercomputer--RedPajama-INCITE-Base-3B-v1/snapshots/094fbdd0c911feb485ce55de1952ab2e75277e1e)

Tokenizer load time : 403.71 ms
Tokenizer CPU memory: 24.60 MB

Model load time : 7301.54 ms
Model CPU memory: 0.48 GB
Model GPU memory: 5.33 GB
Max   GPU memory: 5.33 GB



In [13]:
model_benchmark.check_prefill(30)

1,128,26.00,0.01
2,128,21.64,0.03
3,128,23.94,0.04
4,128,26.08,0.05
5,128,33.23,0.06
6,128,37.86,0.08
7,128,44.32,0.09
8,128,44.16,0.10
9,128,51.76,0.11
10,128,57.60,0.13
11,128,61.57,0.14
12,128,78.21,0.15
13,128,70.41,0.16
14,128,75.92,0.18
15,128,79.01,0.19
16,128,83.47,0.20
17,128,89.58,0.22
18,128,95.11,0.23
19,128,99.08,0.24
20,128,104.85,0.25
21,128,111.69,0.27
22,128,130.38,0.28
23,128,120.73,0.29
24,128,124.92,0.30
25,128,132.10,0.32
26,128,140.93,0.33
27,128,141.85,0.34
28,128,148.08,0.35
29,128,154.12,0.37
1,256,31.73,0.03
2,256,28.20,0.05
3,256,39.70,0.08
4,256,45.57,0.10
5,256,57.65,0.13
6,256,66.39,0.15
7,256,90.81,0.18
8,256,88.43,0.20
9,256,102.81,0.23
10,256,115.68,0.25
11,256,127.42,0.28
12,256,138.54,0.30
13,256,152.49,0.33
14,256,163.05,0.35
15,256,174.82,0.38
16,256,185.17,0.40
17,256,200.74,0.43
18,256,211.15,0.46
19,256,224.28,0.48
20,256,235.55,0.51
21,256,249.25,0.53
22,256,262.17,0.56
23,256,279.91,0.58
24,256,284.02,0.61
25,256,298.43,0.63
26,256,348.39,0.66


In [14]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:04:14 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_benchmark, model_id

*** Circular ref objects gc collected during the experiment:
cleared 2046 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:      576        0 MB (  0.00%)
GPU:    5,296    5,524 MB (104.29%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:   1,880  12,604  15,837 MB  11.87% 
GPU:   1,000  23,563  24,564 MB   4.07% 




## 8 bits

In [15]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:   1,880  12,601  15,837 MB  11.87% 
GPU:   1,000  23,563  24,564 MB   4.07% 




In [16]:
model_id = "togethercomputer/RedPajama-INCITE-Base-3B-v1"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", load_in_8bit=True)

Model files: 5.30 GB on disk
(cache path: /models/huggingface/transformers/models--togethercomputer--RedPajama-INCITE-Base-3B-v1/snapshots/094fbdd0c911feb485ce55de1952ab2e75277e1e)

Tokenizer load time : 234.35 ms
Tokenizer CPU memory: 0.00 MB

Model load time : 3867.51 ms
Model CPU memory: 0.43 GB
Model GPU memory: 3.06 GB
Max   GPU memory: 3.07 GB



In [17]:
model_benchmark.check_prefill(30)

2023-11-25 18:26:59.999616: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-25 18:27:00.165015: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


1,128,139.23,0.01
2,128,126.06,0.03
3,128,150.38,0.04
4,128,128.92,0.06
5,128,148.06,0.07
6,128,99.86,0.09
7,128,110.90,0.10
8,128,112.90,0.11
9,128,146.82,0.13
10,128,98.36,0.14
11,128,118.09,0.15
12,128,162.51,0.17
13,128,125.73,0.18
14,128,131.17,0.20
15,128,181.03,0.21
16,128,150.02,0.22
17,128,168.77,0.23
18,128,170.73,0.25
19,128,159.60,0.26
20,128,188.03,0.28
21,128,192.24,0.29
22,128,243.53,0.30
23,128,175.19,0.32
24,128,192.78,0.33
25,128,215.48,0.35
26,128,227.16,0.36
27,128,235.21,0.37
28,128,236.02,0.39
29,128,240.61,0.40
1,256,119.39,0.03
2,256,144.90,0.06
3,256,136.77,0.08
4,256,112.83,0.11
5,256,153.43,0.14
6,256,144.35,0.16
7,256,158.68,0.20
8,256,160.82,0.22
9,256,190.68,0.25
10,256,199.55,0.28
11,256,210.79,0.30
12,256,223.55,0.33
13,256,232.53,0.36
14,256,258.61,0.39
15,256,258.73,0.41
16,256,283.96,0.44
17,256,284.48,0.47
18,256,277.84,0.50
19,256,302.58,0.53
20,256,322.53,0.55
21,256,342.52,0.58
22,256,335.89,0.61
23,256,360.73,0.63
24,256,382.35,0.66
25,256,388.89

In [18]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:10:51 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_benchmark, model_id

*** Circular ref objects gc collected during the experiment:
cleared 11457 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:      802        0 MB (  0.00%)
GPU:    3,400    3,324 MB ( 97.76%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:   2,683  12,049  15,837 MB  16.94% 
GPU:   1,076  23,487  24,564 MB   4.38% 




## 4 bits

In [19]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:   2,683  12,049  15,837 MB  16.94% 
GPU:   1,076  23,487  24,564 MB   4.38% 




In [20]:
model_id = "togethercomputer/RedPajama-INCITE-Base-3B-v1"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", load_in_4bit=True)

Model files: 5.30 GB on disk
(cache path: /models/huggingface/transformers/models--togethercomputer--RedPajama-INCITE-Base-3B-v1/snapshots/094fbdd0c911feb485ce55de1952ab2e75277e1e)

Tokenizer load time : 244.97 ms
Tokenizer CPU memory: 0.00 MB

Model load time : 4242.37 ms
Model CPU memory: 0.40 GB
Model GPU memory: 1.95 GB
Max   GPU memory: 1.96 GB



In [21]:
model_benchmark.check_prefill(30)



1,128,60.92,0.16
2,128,71.53,0.17
3,128,84.30,0.18
4,128,97.68,0.19
5,128,118.82,0.20
6,128,129.78,0.21
7,128,145.19,0.22
8,128,155.80,0.23
9,128,175.67,0.24
10,128,191.66,0.25
11,128,203.37,0.26
12,128,221.43,0.27
13,128,241.67,0.28
14,128,254.04,0.29
15,128,271.51,0.30
16,128,282.05,0.31
17,128,298.93,0.32
18,128,320.51,0.33
19,128,339.63,0.35
20,128,353.81,0.36
21,128,369.18,0.37
22,128,383.35,0.38
23,128,405.28,0.39
24,128,412.20,0.40
25,128,434.94,0.41
26,128,453.60,0.42
27,128,469.91,0.43
28,128,527.36,0.44
29,128,499.12,0.45
1,256,70.25,0.17
2,256,97.60,0.19
3,256,133.43,0.21
4,256,158.39,0.23
5,256,192.77,0.25
6,256,223.31,0.27
7,256,255.22,0.29
8,256,289.06,0.31
9,256,324.01,0.33
10,256,359.26,0.36
11,256,397.50,0.38
12,256,422.38,0.40
13,256,465.23,0.42
14,256,494.10,0.44
15,256,524.89,0.46
16,256,554.36,0.48
17,256,594.87,0.50
18,256,625.88,0.52
19,256,657.29,0.54
20,256,693.34,0.57
21,256,734.68,0.59
22,256,765.58,0.61
23,256,798.60,0.63
24,256,865.42,0.66
25,256,869.00,0.6

In [22]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:10:21 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_benchmark, model_id

*** Circular ref objects gc collected during the experiment:
cleared 13086 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:      406      416 MB (102.46%)
GPU:    2,130    2,002 MB ( 93.99%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:   2,673  12,018  15,837 MB  16.88% 
GPU:   1,204  23,359  24,564 MB   4.91% 




# StableLM-3B

stabilityai/stablelm-3b-4e1t

https://huggingface.co/stabilityai/stablelm-3b-4e1t

In [4]:
with open("/workspace/hftoken", 'r') as file:
    myhftoken = file.read().strip()

In [6]:
with IPyExperimentsPytorch(cl_enable=False):
    model_id = "stabilityai/stablelm-3b-4e1t"
    ModelForCausalLMBenchmark.download_in_local_cache(model_id, trust_remote_code=True, token=myhftoken)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     230  14,298  15,837 MB   1.45% 
GPU:   1,122  23,441  24,564 MB   4.57% 


Loading model stabilityai/stablelm-3b-4e1t in local cache ...
--> model files size   : 5.21 GB
--> stored in directory: /models/huggingface/transformers/models--stabilityai--stablelm-3b-4e1t/snapshots/c6554ba60f40a8252d2a43e38e55ee2e3a645813


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:00:12 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_id

*** Circular ref objects gc collected during the experiment:
cleared 93 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:     -180     -110 MB ( 61.27%)
GPU:        0        0 MB (100.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:     160  14,382  15,837 MB   1.01% 
GPU:   1,122  23,441  24,564 M

## Analysis

In [15]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     971  29,608  31,785 MB   3.06% 
GPU:   1,404  23,159  24,564 MB   5.72% 




In [16]:
model_id = "stabilityai/stablelm-3b-4e1t"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype="auto", trust_remote_code=True, token=myhftoken)

Model files: 5.21 GB on disk
(cache path: /models/huggingface/transformers/models--stabilityai--stablelm-3b-4e1t/snapshots/c6554ba60f40a8252d2a43e38e55ee2e3a645813)

Tokenizer load time : 256.93 ms
Tokenizer CPU memory: 2.06 MB

Model load time : 1622.29 ms
Model CPU memory: 0.01 GB
Model GPU memory: 5.24 GB
Max   GPU memory: 5.25 GB



In [None]:
try:
    model_benchmark.trace_prefill(4, 4096)
finally:
    del model_benchmark

In [13]:
release_cached_memory()
get_used_and_max_gpu_memory()

(8519680, 11880278016)

In [14]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:05:26 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_id

*** Circular ref objects gc collected during the experiment:
cleared 551 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:       13        0 MB (  0.00%)
GPU:        0        0 MB (100.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:     971  29,609  31,785 MB   3.06% 
GPU:   1,404  23,159  24,564 MB   5.72% 




In [8]:
display_modules(model_benchmark.model)

---------------------
StableLMEpochForCausalLM
> submodules
- model: StableLMEpochModel
- lm_head: Linear
  ---------------------
  model#StableLMEpochModel
  > submodules
  - embed_tokens: Embedding
  - layers: ModuleList
  - norm: LayerNorm
    ---------------------
    embed_tokens#Embedding
    > parameters
    - weight: bfloat16 [50304, 2560] (245.6 MB)
    ---------------------
    layers#ModuleList
    > submodules
    - 0..31: 32X DecoderLayer
      ---------------------
      0..31#DecoderLayer
      > submodules
      - self_attn: Attention
      - mlp: MLP
      - input_layernorm: LayerNorm
      - post_attention_layernorm: LayerNorm
        ---------------------
        self_attn#Attention
        > submodules
        - q_proj: Linear
        - k_proj: Linear
        - v_proj: Linear
        - o_proj: Linear
        - rotary_emb: RotaryEmbedding
          ---------------------
          q_proj#Linear
          > parameters
          - weight: bfloat16 [2560, 2560] (12.5 MB)

In [7]:
logits = outputs['logits']
past_key_values = outputs['past_key_values']

In [10]:
logits.size(),logits.dtype

(torch.Size([2, 1000, 50432]), torch.float16)

In [14]:
len(past_key_values), past_key_values[0][0].size(), past_key_values[0][0].dtype, past_key_values[0][1].size(), past_key_values[0][1].dtype

(32,
 torch.Size([2, 32, 1000, 80]),
 torch.float16,
 torch.Size([2, 32, 1000, 80]),
 torch.float16)

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "togethercomputer/RedPajama-INCITE-Base-3B-v1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

In [9]:
encodings = tokenizer(["un test","un deuxième test"], padding="longest", return_tensors="pt")

In [10]:
encodings

{'input_ids': tensor([[  328,  1071,     0,     0,     0],
        [  328, 23156,    74, 22722,  1071]]), 'attention_mask': tensor([[1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1]])}

In [13]:
encodings["input_ids"].size(), encodings["input_ids"].dtype

(torch.Size([2, 5]), torch.int64)

In [14]:
encodings["attention_mask"].size(), encodings["attention_mask"].dtype

(torch.Size([2, 5]), torch.int64)

In [22]:
output["logits"].size(), output["logits"].dtype

(torch.Size([2, 5, 50432]), torch.float16)

In [21]:
output.keys()

odict_keys(['logits', 'past_key_values'])

In [27]:
len(output["past_key_values"]),output["past_key_values"][0][1].size(),output["past_key_values"][0][1].dtype

(32, torch.Size([2, 32, 5, 80]), torch.float16)

# Falcon-7B

tiiuae/falcon-7b

https://huggingface.co/tiiuae/falcon-7b

In [7]:
with IPyExperimentsPytorch(cl_enable=False):
    model_id = "tiiuae/falcon-7b"
    ModelForCausalLMBenchmark.download_in_local_cache(model_id)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     556  29,651  31,785 MB   1.75% 
GPU:   1,395  23,168  24,564 MB   5.68% 


Loading model tiiuae/falcon-7b in local cache ...


Downloading (…)model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

--> model files size   : 13.45 GB
--> stored in directory: /models/huggingface/transformers/models--tiiuae--falcon-7b/snapshots/898df1396f35e447d5fe44e0a3ccaaaa69f30d36


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:14:32 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_id

*** Circular ref objects gc collected during the experiment:
cleared 105 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:     -408        0 MB (  0.09%)
GPU:      -16        0 MB ( -0.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:     148  30,360  31,785 MB   0.47% 
GPU:   1,379  23,184  24,564 MB   5.61% 




## Analysis

In [5]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     559  29,911  31,785 MB   1.76% 
GPU:     996  23,567  24,564 MB   4.06% 




In [7]:
model_id = "tiiuae/falcon-7b"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model files: 13.45 GB on disk
(cache path: /models/huggingface/transformers/models--tiiuae--falcon-7b/snapshots/898df1396f35e447d5fe44e0a3ccaaaa69f30d36)

Tokenizer load time : 289.48 ms
Tokenizer CPU memory: 30.38 MB

Model load time : 25717.15 ms
Model CPU memory: 0.02 GB
Model GPU memory: 12.94 GB
Max   GPU memory: 13.49 GB



In [7]:
try:
    model_benchmark.trace_prefill(6, 2048)
finally:
    del model_benchmark

The current implementation of Falcon calls `torch.scaled_dot_product_attention` directly, this will be deprecated in the future in favor of the `BetterTransformer` API. Please install the latest optimum library with `pip install -U optimum` and call `model.to_bettertransformer()` to benefit from `torch.scaled_dot_product_attention` and future performance optimizations.


Prefill test for batch size 6 and sequence length 2048:
transformer.word_embeddings;True;;222.1;;int64[6, 2048];0.1;0.1;106.6;106.6;bfloat16[6, 2048, 4544];106.5;;0.0;;;-14800.5;-14800.5;-14800.5;0.0;;0.0
transformer.h.0.input_layernorm;True;;178.6;;bfloat16[6, 2048, 4544];106.5;154.6;261.2;261.1;bfloat16[6, 2048, 4544];106.5;;0.0;;;-14800.5;-14800.5;-14800.5;0.0;;0.0
transformer.h.0.self_attention.query_key_value;True;;206.6;;bfloat16[6, 2048, 4544];106.5;261.1;370.6;370.6;bfloat16[6, 2048, 4672];109.5;;0.0;;;-14800.5;-14800.5;-14800.5;0.0;;0.0
transformer.h.0.self_attention.maybe_rotary;True;;407.7;;bfloat16[426, 2048, 64]bfloat16[6, 2048, 64]int64[1, 2048];108.0;480.1;1016.1;588.1;bfloat16[426, 2048, 64]bfloat16[6, 2048, 64];108.0;;0.0;;;-14800.5;-14800.5;-14800.5;0.0;;0.0
transformer.h.0.self_attention.dense;True;;253.6;;bfloat16[6, 2048, 4544];106.5;586.6;693.1;693.1;bfloat16[6, 2048, 4544];106.5;;0.0;;;-14800.5;-14800.5;-14800.5;0.0;;0.0
transformer.h.0.self_attention;False;;9935



In [9]:
release_cached_memory()
get_used_and_max_gpu_memory()

(8519680, 17229268992)

In [10]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:01:46 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_id

*** Circular ref objects gc collected during the experiment:
cleared 2714 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:      417        0 MB (  0.00%)
GPU:      578        0 MB (  0.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:     969  29,606  31,785 MB   3.05% 
GPU:   1,906  22,657  24,564 MB   7.76% 




In [13]:
display_modules(model_benchmark.model)

---------------------
FalconForCausalLM
> submodules
- transformer: FalconModel
- lm_head: Linear
  ---------------------
  transformer#FalconModel
  > submodules
  - word_embeddings: Embedding
  - h: ModuleList
  - ln_f: LayerNorm
    ---------------------
    word_embeddings#Embedding
    > parameters
    - weight: bfloat16 [65024, 4544] (563.6 MB)
    ---------------------
    h#ModuleList
    > submodules
    - 0..31: 32X FalconDecoderLayer
      ---------------------
      0..31#FalconDecoderLayer
      > submodules
      - self_attention: FalconAttention
      - mlp: FalconMLP
      - input_layernorm: LayerNorm
        ---------------------
        self_attention#FalconAttention
        > submodules
        - maybe_rotary: FalconRotaryEmbedding
        - query_key_value: FalconLinear
        - dense: FalconLinear
        - attention_dropout: Dropout
          ---------------------
          maybe_rotary#FalconRotaryEmbedding
          > buffers
          - inv_freq: float32 [32] 

## Performances

### Native HF version

In [18]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     954  29,435  31,785 MB   3.00% 
GPU:   1,789  22,774  24,564 MB   7.28% 




In [19]:
model_id = "tiiuae/falcon-7b"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model files: 13.45 GB on disk
(cache path: /models/huggingface/transformers/models--tiiuae--falcon-7b/snapshots/898df1396f35e447d5fe44e0a3ccaaaa69f30d36)

Tokenizer load time : 277.75 ms
Tokenizer CPU memory: 15.63 MB

Model load time : 14560.34 ms
Model CPU memory: 0.00 GB
Model GPU memory: 12.94 GB
Max   GPU memory: 13.50 GB



In [20]:
model_benchmark.check_prefill(max_batch_size=8)

1,128,3156.07,40.56,12.96,0.02,12.98
2,128,5141.33,49.79,12.96,0.03,13.00
3,128,7377.56,52.05,12.96,0.05,13.01
4,128,7012.45,73.01,12.96,0.07,13.03
5,128,8208.17,77.97,12.96,0.08,13.05
6,128,8003.24,95.96,12.96,0.10,13.06
7,128,7595.15,117.97,12.96,0.12,13.08
8,128,9246.43,110.75,12.96,0.13,13.10
1,256,5730.10,44.68,12.96,0.03,13.00
2,256,7924.44,64.61,12.96,0.07,13.03
3,256,8084.97,94.99,12.96,0.10,13.06
4,256,8844.45,115.78,12.96,0.13,13.10
5,256,8469.60,151.13,12.96,0.17,13.13
6,256,9299.06,165.18,12.96,0.20,13.16
7,256,9068.66,197.60,12.96,0.23,13.20
8,256,8660.36,236.48,12.96,0.27,13.23
1,512,7660.64,66.84,12.96,0.10,13.06
2,512,7794.13,131.38,12.96,0.19,13.16
3,512,8482.23,181.08,12.96,0.29,13.25
4,512,8616.79,237.68,12.96,0.39,13.35
5,512,8790.54,291.22,12.96,0.48,13.44
6,512,8826.76,348.03,12.96,0.58,13.54
7,512,9029.24,396.93,12.96,0.68,13.64
8,512,9099.31,450.14,12.96,0.77,13.73
1,1024,7076.54,144.70,12.96,0.33,13.30
2,1024,7520.88,272.31,12.96,0.66,13.63
3,1024,7609.60,403.7

In [21]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:03:29 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_benchmark, model_id

*** Circular ref objects gc collected during the experiment:
cleared 755 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:       15       15 MB ( 99.53%)
GPU:   12,798   12,798 MB (100.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:     954  29,425  31,785 MB   3.00% 
GPU:   1,789  22,774  24,564 MB   7.28% 




### Compiled version

In [5]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     554  29,776  31,785 MB   1.74% 
GPU:   1,146  23,417  24,564 MB   4.67% 




In [6]:
model_id = "tiiuae/falcon-7b"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model files: 13.45 GB on disk
(cache path: /models/huggingface/transformers/models--tiiuae--falcon-7b/snapshots/898df1396f35e447d5fe44e0a3ccaaaa69f30d36)

Tokenizer load time : 280.54 ms
Tokenizer CPU memory: 30.21 MB

Model load time : 11613.38 ms
Model CPU memory: 0.02 GB
Model GPU memory: 12.94 GB
Max   GPU memory: 13.49 GB



In [7]:
model_benchmark.model = torch.compile(model_benchmark.model) #, fullgraph=True)

- ERROR with fulllgraph=True: https://github.com/huggingface/transformers/issues/27789
- kernel dies for unknown reason with: mode="max-autotune"
- AssertionError: anyway without parameters

Giving up for now

In [None]:
model_benchmark.check_prefill(max_batch_size=8)

In [9]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:13:37 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_benchmark, model_id

*** Circular ref objects gc collected during the experiment:
cleared 29148 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:      381        0 MB (  0.00%)
GPU:   13,960        0 MB (  0.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:     936  28,589  31,785 MB   2.95% 
GPU:  15,106   9,457  24,564 MB  61.50% 




### Flash attention 2 version

In [9]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     933  29,415  31,785 MB   2.94% 
GPU:   1,788  22,775  24,564 MB   7.28% 




In [10]:
model_id = "tiiuae/falcon-7b"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype="auto", use_flash_attention_2=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model files: 13.45 GB on disk
(cache path: /models/huggingface/transformers/models--tiiuae--falcon-7b/snapshots/898df1396f35e447d5fe44e0a3ccaaaa69f30d36)

Tokenizer load time : 153.23 ms
Tokenizer CPU memory: 27.83 MB

Model load time : 4795.27 ms
Model CPU memory: 0.00 GB
Model GPU memory: 12.94 GB
Max   GPU memory: 13.50 GB



In [11]:
model_benchmark.check_prefill(max_batch_size=40)

1,128,3619.80,35.36,12.96,0.02,12.98
2,128,6410.97,39.93,12.96,0.03,13.00
3,128,7467.92,51.42,12.96,0.05,13.01
4,128,7616.50,67.22,12.96,0.07,13.03
5,128,9064.32,70.61,12.96,0.08,13.05
6,128,8408.11,91.34,12.96,0.10,13.06
7,128,9773.70,91.67,12.96,0.12,13.08
8,128,9364.34,109.35,12.96,0.13,13.10
9,128,9504.79,121.20,12.96,0.15,13.11
10,128,9686.05,132.15,12.96,0.17,13.13
11,128,9831.35,143.22,12.96,0.18,13.15
12,128,10351.43,148.39,12.96,0.20,13.16
13,128,10337.61,160.97,12.96,0.22,13.18
14,128,10499.85,170.67,12.96,0.23,13.20
15,128,10066.29,190.74,12.96,0.25,13.21
16,128,10321.34,198.42,12.96,0.27,13.23
17,128,10607.14,205.14,12.96,0.28,13.25
18,128,10735.19,214.62,12.96,0.30,13.26
19,128,10687.54,227.55,12.96,0.32,13.28
20,128,10581.16,241.94,12.96,0.33,13.30
21,128,10751.49,250.01,12.96,0.35,13.31
22,128,10329.15,272.63,12.96,0.37,13.33
23,128,10377.48,283.69,12.96,0.38,13.35
24,128,10555.01,291.05,12.96,0.40,13.36
25,128,10853.33,294.84,12.96,0.41,13.38
26,128,10693.20,311.23,12.9

KeyboardInterrupt: 

In [12]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:07:19 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_benchmark, model_id

*** Circular ref objects gc collected during the experiment:
cleared 1537 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:       28        0 MB (  0.00%)
GPU:   12,653        0 MB (  0.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:     962  29,387  31,785 MB   3.03% 
GPU:  14,442  10,121  24,564 MB  58.79% 




### Flash attention 2 / 8 bits version

In [5]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     554  29,752  31,785 MB   1.74% 
GPU:   1,130  23,433  24,564 MB   4.60% 




In [6]:
model_id = "tiiuae/falcon-7b"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype="auto", use_flash_attention_2=True, load_in_8bit=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model files: 13.45 GB on disk
(cache path: /models/huggingface/transformers/models--tiiuae--falcon-7b/snapshots/898df1396f35e447d5fe44e0a3ccaaaa69f30d36)

Tokenizer load time : 259.53 ms
Tokenizer CPU memory: 29.71 MB

Model load time : 7416.93 ms
Model CPU memory: 0.09 GB
Model GPU memory: 6.73 GB
Max   GPU memory: 7.28 GB



In [None]:
model_benchmark.check_prefill(max_batch_size=64)

2023-12-05 23:04:23.953951: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-05 23:04:23.980645: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


1,128,1148.55,111.44,6.76,0.02,6.79
2,128,2320.54,110.32,6.76,0.05,6.81
3,128,3716.02,103.34,6.76,0.07,6.83
4,128,4417.32,115.91,6.76,0.10,6.86
5,128,6345.02,100.87,6.76,0.12,6.88
6,128,7732.24,99.32,6.76,0.14,6.90
7,128,6916.41,129.55,6.76,0.16,6.93
8,128,8820.65,116.09,6.76,0.19,6.95
9,128,8228.79,140.00,6.76,0.21,6.97
10,128,8684.90,147.38,6.76,0.23,7.00
11,128,9222.18,152.68,6.76,0.26,7.02
12,128,9365.54,164.01,6.76,0.28,7.05
13,128,9169.83,181.46,6.77,0.30,7.07
14,128,9362.42,191.40,6.77,0.33,7.09
15,128,9895.10,194.04,6.77,0.35,7.12
16,128,9190.62,222.84,6.77,0.38,7.14
17,128,9376.95,232.06,6.77,0.40,7.16
18,128,10648.52,216.37,6.77,0.42,7.19
19,128,9683.60,251.15,6.77,0.45,7.21
20,128,9377.15,273.00,6.77,0.47,7.23
21,128,10274.52,261.62,6.77,0.49,7.26
22,128,10311.49,273.09,6.77,0.51,7.28
23,128,9760.48,301.62,6.77,0.54,7.30
24,128,10799.48,284.46,6.77,0.56,7.33
25,128,10819.12,295.77,6.77,0.58,7.35
26,128,10748.89,309.61,6.77,0.61,7.37
27,128,11067.99,312.25,6.77,0.63,7.40
28,1

In [8]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:01:48 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_benchmark, model_id

*** Circular ref objects gc collected during the experiment:
cleared 1929 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:    1,037        0 MB (  0.00%)
GPU:    8,976        0 MB (  0.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:   1,595  29,032  31,785 MB   5.02% 
GPU:  10,106  14,457  24,564 MB  41.14% 




### Flash attention 2 / 4 bits version

In [5]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     567  29,722  31,785 MB   1.79% 
GPU:   1,109  23,454  24,564 MB   4.52% 




In [6]:
model_id = "tiiuae/falcon-7b"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype="auto", use_flash_attention_2=True, load_in_4bit=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model files: 13.45 GB on disk
(cache path: /models/huggingface/transformers/models--tiiuae--falcon-7b/snapshots/898df1396f35e447d5fe44e0a3ccaaaa69f30d36)

Tokenizer load time : 329.99 ms
Tokenizer CPU memory: 30.09 MB

Model load time : 21322.12 ms
Model CPU memory: 0.09 GB
Model GPU memory: 4.06 GB
Max   GPU memory: 4.61 GB



In [7]:
model_benchmark.check_prefill(max_batch_size=64)

2023-12-05 23:22:41.297748: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-05 23:22:41.350516: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


1,128,1495.51,85.59,4.08,0.32,4.40
2,128,2775.95,92.22,4.08,0.33,4.41
3,128,3707.59,103.57,4.08,0.33,4.42
4,128,4313.38,118.70,4.08,0.34,4.43
5,128,4922.25,130.02,4.08,0.35,4.43
6,128,5392.55,142.42,4.08,0.36,4.44
7,128,6129.16,146.19,4.08,0.37,4.45
8,128,6297.46,162.61,4.08,0.38,4.46
9,128,6543.13,176.06,4.08,0.39,4.47
10,128,5980.11,214.04,4.08,0.40,4.48
11,128,7045.90,199.83,4.08,0.40,4.49
12,128,7336.65,209.36,4.08,0.41,4.50
13,128,7563.31,220.01,4.08,0.42,4.50
14,128,8080.85,221.76,4.08,0.43,4.51
15,128,7720.86,248.68,4.08,0.44,4.52
16,128,8246.93,248.33,4.08,0.45,4.53
17,128,8415.62,258.57,4.08,0.46,4.54
18,128,8558.20,269.22,4.08,0.46,4.55
19,128,8535.33,284.93,4.08,0.48,4.56
20,128,8816.31,290.37,4.08,0.48,4.56
21,128,8934.01,300.87,4.08,0.49,4.58
22,128,8659.50,325.19,4.08,0.50,4.58
23,128,8808.82,334.21,4.08,0.51,4.59
24,128,8945.24,343.42,4.08,0.52,4.60
25,128,9292.41,344.37,4.08,0.53,4.61
26,128,9410.15,353.66,4.08,0.53,4.62
27,128,9312.89,371.10,4.08,0.54,4.63
28,128,9347.

KeyboardInterrupt: 

In [12]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:07:19 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_benchmark, model_id

*** Circular ref objects gc collected during the experiment:
cleared 1537 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:       28        0 MB (  0.00%)
GPU:   12,653        0 MB (  0.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:     962  29,387  31,785 MB   3.03% 
GPU:  14,442  10,121  24,564 MB  58.79% 




# Falcon-40B / 4 bits instruct models

https://huggingface.co/TheBloke/falcon-40b-instruct-GPTQ

https://huggingface.co/TheBloke/alfred-40B-1023-GPTQ

https://huggingface.co/TheBloke/alfred-40B-1023-AWQ

In [None]:
pip install optimum

In [None]:
pip install auto-gptq

#### TheBloke/falcon-40b-instruct-GPTQ

In [None]:
with IPyExperimentsPytorch(cl_enable=False):
    model_id = "TheBloke/falcon-40b-instruct-GPTQ"
    ModelForCausalLMBenchmark.download_in_local_cache(model_id, trust_remote_code=True)

In [5]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     551  29,849  31,785 MB   1.73% 
GPU:   1,105  23,458  24,564 MB   4.50% 




**IMPORTANT**

Need to fix : /models/huggingface/modules/transformers_modules/TheBloke/falcon-40b-instruct-GPTQ/57ac6eae1469d42d37781df19576896490023ec2/modelling_RW.py

With the changes in : https://huggingface.co/tiiuae/falcon-40b/discussions/13/files

In [7]:
model_id = "TheBloke/falcon-40b-instruct-GPTQ"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype="auto", trust_remote_code=True)

2023-12-09 11:06:53.016213: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-09 11:06:53.052381: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model files: 21.00 GB on disk
(cache path: /models/huggingface/transformers/models--TheBloke--falcon-40b-instruct-GPTQ/snapshots/57ac6eae1469d42d37781df19576896490023ec2)

Tokenizer load time : 231.64 ms
Tokenizer CPU memory: 30.17 MB

Model load time : 18364.91 ms
Model CPU memory: 0.54 GB
Model GPU memory: 20.63 GB
Max   GPU memory: 21.00 GB



In [None]:
model_benchmark.check_prefill(max_batch_size=1)

**ERROR** the fix above was not enough, the repository is tooo old, giving up

#### TheBloke/alfred-40B-1023-GPTQ

In [None]:
with IPyExperimentsPytorch(cl_enable=False):
    model_id = "TheBloke/alfred-40B-1023-GPTQ"
    ModelForCausalLMBenchmark.download_in_local_cache(model_id, trust_remote_code=True)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     562  29,800  31,785 MB   1.77% 
GPU:   1,105  23,458  24,564 MB   4.50% 


Loading model TheBloke/alfred-40B-1023-GPTQ in local cache ...


2023-12-09 11:46:40.902901: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-09 11:46:40.940174: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     563  29,786  31,785 MB   1.77% 
GPU:   1,105  23,458  24,564 MB   4.50% 




In [12]:
model_id = "TheBloke/alfred-40B-1023-GPTQ"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", trust_remote_code=True)

Model files: 21.00 GB on disk
(cache path: /models/huggingface/transformers/models--TheBloke--alfred-40B-1023-GPTQ/snapshots/f8e310f64befd66c681321b01f7b89043a3a7ee3)

Tokenizer load time : 239.32 ms
Tokenizer CPU memory: 14.94 MB

Model load time : 8866.06 ms
Model CPU memory: 0.00 GB
Model GPU memory: 20.63 GB
Max   GPU memory: 21.01 GB



In [13]:
from auto_gptq import exllama_set_max_input_length
model = exllama_set_max_input_length(model_benchmark.model, max_input_length=8192)

In [8]:
model_benchmark.check_prefill(max_batch_size=1)

1,128,456.85,280.18,21.13,0.02,21.15
1,256,949.31,269.67,21.02,0.05,21.07
1,512,1515.01,337.95,21.02,0.10,21.12
1,1024,1974.00,518.74,21.02,0.20,21.22
1,2048,2287.64,895.24,21.03,0.39,21.42
1,4096,2243.64,1825.60,21.05,0.79,21.84
1,8192,168.28,48681.63,21.07,1.62,22.70


In [9]:
model_benchmark.check_prefill(max_batch_size=1)

1,128,520.69,245.83,21.13,0.02,21.15
1,256,961.42,266.27,21.02,0.05,21.07
1,512,1559.34,328.34,21.02,0.10,21.12
1,1024,1991.83,514.10,21.02,0.20,21.22
1,2048,2319.98,882.76,21.03,0.39,21.42
1,4096,2304.45,1777.43,21.05,0.79,21.84
1,8192,111.78,73286.95,21.07,1.62,22.70


In [16]:
import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model=model_benchmark.model,
    tokenizer=model_benchmark.tokenizer
)

sequences = pipeline(
   "<start_system>You are Alfred, a helpful assistant trained by LightOn. Knowledge cutoff: November 2022. Current date: 16 November, 2023<end_message><start_user>Write me an email to my boss, explaining how the company could benefit by using LightOns platform for Large Language Models, Paradigm.<end_message><start_assistant>",
    max_length=1000,
    do_sample=True,
    top_k=3,
    num_return_sequences=1,
    eos_token_id=model_benchmark.tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")



Result: <start_system>You are Alfred, a helpful assistant trained by LightOn. Knowledge cutoff: November 2022. Current date: 16 November, 2023<end_message><start_user>Write me an email to my boss, explaining how the company could benefit by using LightOns platform for Large Language Models, Paradigm.<end_message><start_assistant>Subject: Benefits of using LightOn's Paradigm platform for Large Language Models

Dear [Boss's Name],

I would like to bring to your attention the benefits of using LightOn's platform, Paradigm, for Large Language Models. As you are aware, Large Language Models are becoming increasingly important in various fields such as natural language processing, conversational AI, and machine translation.

By using LightOn's platform, our company can benefit in several ways. First and foremost, it can help us save costs. The platform provides a scalable and cost-effective solution for training and deploying Large Language Models. This means that we can reduce the amount of

In [None]:
del exp

#### TheBloke/alfred-40B-1023-AWQ

In [None]:
pip install autoawq

In [5]:
with IPyExperimentsPytorch(cl_enable=False):
    model_id = "TheBloke/alfred-40B-1023-AWQ"
    ModelForCausalLMBenchmark.download_in_local_cache(model_id, trust_remote_code=True)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     562  29,687  31,785 MB   1.77% 
GPU:   1,094  23,469  24,564 MB   4.46% 


Loading model TheBloke/alfred-40B-1023-AWQ in local cache ...


You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.


Downloading (…)fetensors.index.json:   0%|          | 0.00/86.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/11.0G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.89G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/2.40G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

--> model files size   : 21.72 GB
--> stored in directory: /models/huggingface/transformers/models--TheBloke--alfred-40B-1023-AWQ/snapshots/159a50a0b23df16b1955eb6b1afc34152c345430


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:28:27 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_id

*** Circular ref objects gc collected during the experiment:
cleared 105 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:      254       78 MB ( 31.01%)
GPU:        0        0 MB (100.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:     737  29,559  31,785 MB   2.32% 
GPU:   1,094  23,469  24,564 MB   4.46% 




In [None]:
pip install vllm

In [None]:
from vllm import LLM, SamplingParams

model_id = "TheBloke/alfred-40B-1023-AWQ"
llm = LLM(model=model_id, trust_remote_code=True, max_model_len=8192, download_dir="/models/huggingface/transformers", quantization="AWQ")

You are using a model of type RefinedWeb to instantiate a model of type falcon. This is not supported for all configurations of models and can yield errors.


INFO 12-09 15:14:33 llm_engine.py:73] Initializing an LLM engine with config: model='TheBloke/alfred-40B-1023-AWQ', tokenizer='TheBloke/alfred-40B-1023-AWQ', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir='/models/huggingface/transformers', load_format=auto, tensor_parallel_size=1, quantization=awq, seed=0)


**too slow** the function call above was interrupted after more than 5 minutes

# Llama-2-7B

meta-llama/Llama-2-7b-hf

https://huggingface.co/meta-llama/Llama-2-7b-hf

In [6]:
with open("/workspace/hftoken", 'r') as file:
    myhftoken = file.read().strip()

In [8]:
with IPyExperimentsPytorch(cl_enable=False):
    model_id = "meta-llama/Llama-2-7b-hf"
    ModelForCausalLMBenchmark.download_in_local_cache(model_id, token=myhftoken)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     554  29,977  31,785 MB   1.74% 
GPU:   1,328  23,235  24,564 MB   5.41% 


Loading model meta-llama/Llama-2-7b-hf in local cache ...


Downloading tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Downloading generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

--> model files size   : 12.55 GB
--> stored in directory: /models/huggingface/transformers/models--meta-llama--Llama-2-7b-hf/snapshots/8cca527612d856d7d32bd94f8103728d614eb852


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:14:24 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_id

*** Circular ref objects gc collected during the experiment:
cleared 105 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:     -344        0 MB (  0.18%)
GPU:        0        0 MB (100.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:     210  30,201  31,785 MB   0.66% 
GPU:   1,328  23,235  24,564 MB   5.41% 




## Analysis

In [15]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     739  29,709  31,785 MB   2.33% 
GPU:   1,422  23,141  24,564 MB   5.79% 




In [16]:
model_id = "meta-llama/Llama-2-7b-hf"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype="auto", token=myhftoken)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model files: 12.55 GB on disk
(cache path: /models/huggingface/transformers/models--meta-llama--Llama-2-7b-hf/snapshots/8cca527612d856d7d32bd94f8103728d614eb852)

Tokenizer load time : 160.45 ms
Tokenizer CPU memory: 13.38 MB

Model load time : 6971.62 ms
Model CPU memory: 0.14 GB
Model GPU memory: 12.61 GB
Max   GPU memory: 12.62 GB



In [17]:
display_modules(model_benchmark.model)

---------------------
LlamaForCausalLM
> submodules
- model: LlamaModel
- lm_head: Linear
  ---------------------
  model#LlamaModel
  > submodules
  - embed_tokens: Embedding
  - layers: ModuleList
  - norm: LlamaRMSNorm
    ---------------------
    embed_tokens#Embedding
    > parameters
    - weight: float16 [32000, 4096] (250.0 MB)
    ---------------------
    layers#ModuleList
    > submodules
    - 0..31: 32X LlamaDecoderLayer
      ---------------------
      0..31#LlamaDecoderLayer
      > submodules
      - self_attn: LlamaAttention
      - mlp: LlamaMLP
      - input_layernorm: LlamaRMSNorm
      - post_attention_layernorm: LlamaRMSNorm
        ---------------------
        self_attn#LlamaAttention
        > submodules
        - q_proj: Linear
        - k_proj: Linear
        - v_proj: Linear
        - o_proj: Linear
        - rotary_emb: LlamaRotaryEmbedding
          ---------------------
          q_proj#Linear
          > parameters
          - weight: float16 [4096, 40

In [12]:
try:
    model_benchmark.trace_prefill(6, 2048)
finally:
    del model_benchmark

Prefill test for batch size 6 and sequence length 2048:
model.embed_tokens;True;;502.3;;int64[6, 2048];0.1;0.1;96.1;96.1;float16[6, 2048, 4096];96.0;;0.0;;;-14424.7;-14424.7;-14424.7;0.0;;0.0
model.layers.0.input_layernorm;True;;295.4;;float16[6, 2048, 4096];96.0;144.1;528.2;240.1;float16[6, 2048, 4096];96.0;;0.0;;;-14424.7;-14424.7;-14424.7;0.0;;0.0
model.layers.0.self_attn.q_proj;True;;192.4;;float16[6, 2048, 4096];96.0;240.1;336.1;336.1;float16[6, 2048, 4096];96.0;;0.0;;;-14424.7;-14424.7;-14424.7;0.0;;0.0
model.layers.0.self_attn.k_proj;True;;130.3;;float16[6, 2048, 4096];96.0;336.1;432.1;432.1;float16[6, 2048, 4096];96.0;;0.0;;;-14424.7;-14424.7;-14424.7;0.0;;0.0
model.layers.0.self_attn.v_proj;True;;127.8;;float16[6, 2048, 4096];96.0;432.1;528.1;528.1;float16[6, 2048, 4096];96.0;;0.0;;;-14424.7;-14424.7;-14424.7;0.0;;0.0
model.layers.0.self_attn.rotary_emb;True;;126.5;;float16[6, 32, 2048, 128];96.0;528.1;528.1;528.1;float16[2048, 128]float16[2048, 128];1.0;;0.0;;;-14424.7;-14424



In [18]:
release_cached_memory()
get_used_and_max_gpu_memory()

(13552476160, 13552476160)

In [19]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:00:58 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_benchmark, model_id

*** Circular ref objects gc collected during the experiment:
cleared 517 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:      158      145 MB ( 92.39%)
GPU:   12,918   12,918 MB (100.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:     751  29,685  31,785 MB   2.36% 
GPU:   1,422  23,141  24,564 MB   5.79% 




# Mistral-7B

mistralai/Mistral-7B-v0.1

https://huggingface.co/mistralai/Mistral-7B-v0.1

In [None]:
with IPyExperimentsPytorch(cl_enable=False):
    model_id = "mistralai/Mistral-7B-v0.1"
    ModelForCausalLMBenchmark.download_in_local_cache(model_id)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     577  30,026  31,785 MB   1.82% 
GPU:   1,344  23,219  24,564 MB   5.47% 


Loading model mistralai/Mistral-7B-v0.1 in local cache ...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Analysis

In [5]:
exp = IPyExperimentsPytorch(cl_enable=False)


*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 4090 (24564 RAM)


*** Current state:
RAM:    Used    Free   Total       Util
CPU:     551  29,953  31,785 MB   1.74% 
GPU:     996  23,567  24,564 MB   4.06% 




In [5]:
record_memory_history(True)

In [6]:
model_id = "mistralai/Mistral-7B-v0.1"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="auto", torch_dtype="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model files: 13.98 GB on disk
(cache path: /models/huggingface/transformers/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658)

Tokenizer load time : 301.49 ms
Tokenizer CPU memory: 42.26 MB

Model load time : 55007.52 ms
Model CPU memory: 1.87 GB
Model GPU memory: 13.99 GB
Max   GPU memory: 13.99 GB



In [8]:
dump_memory_snapshot()

Dumped memory snapshot to file: memory_snapshot_20231202_023305.pickle


#### Modules

In [7]:
display_modules(model_benchmark.model)

---------------------
MistralForCausalLM
> submodules
- model: MistralModel
- lm_head: Linear
  ---------------------
  model#MistralModel
  > submodules
  - embed_tokens: Embedding
  - layers: ModuleList
  - norm: MistralRMSNorm
    ---------------------
    embed_tokens#Embedding
    > parameters
    - weight: bfloat16 [32000, 4096] (250.0 MB)
    ---------------------
    layers#ModuleList
    > submodules
    - 0..31: 32X MistralDecoderLayer
      ---------------------
      0..31#MistralDecoderLayer
      > submodules
      - self_attn: MistralAttention
      - mlp: MistralMLP
      - input_layernorm: MistralRMSNorm
      - post_attention_layernorm: MistralRMSNorm
        ---------------------
        self_attn#MistralAttention
        > submodules
        - q_proj: Linear
        - k_proj: Linear
        - v_proj: Linear
        - o_proj: Linear
        - rotary_emb: MistralRotaryEmbedding
          ---------------------
          q_proj#Linear
          > parameters
          - 

#### Perf test

In [9]:
release_cached_memory()
get_used_and_max_gpu_memory()

(15020351488, 15020351488)

In [10]:
try:
    model_benchmark.trace_prefill(2, 4096)
finally:
    del model_benchmark

Prefill test for batch size 2 and sequence length 4096:
model.embed_tokens;True;;516.3;;int64[2, 4096];0.1;0.1;64.1;64.1;bfloat16[2, 4096, 4096];64.0;;0.0;;;-15332.7;-15332.7;-15332.7;0.0;;0.0
model.layers.0.input_layernorm;True;;487.2;;bfloat16[2, 4096, 4096];64.0;128.1;384.1;192.1;bfloat16[2, 4096, 4096];64.0;;0.0;;;-15332.7;-15332.7;-15332.7;0.0;;0.0
model.layers.0.self_attn.q_proj;True;;381.7;;bfloat16[2, 4096, 4096];64.0;192.1;256.1;256.1;bfloat16[2, 4096, 4096];64.0;;0.0;;;-15332.7;-15332.7;-15332.7;0.0;;0.0
model.layers.0.self_attn.k_proj;True;;251.0;;bfloat16[2, 4096, 4096];64.0;256.1;272.1;272.1;bfloat16[2, 4096, 1024];16.0;;0.0;;;-15332.7;-15332.7;-15332.7;0.0;;0.0
model.layers.0.self_attn.v_proj;True;;148.7;;bfloat16[2, 4096, 4096];64.0;272.1;288.1;288.1;bfloat16[2, 4096, 1024];16.0;;0.0;;;-15332.7;-15332.7;-15332.7;0.0;;0.0
model.layers.0.self_attn.rotary_emb;True;;236.7;;bfloat16[2, 8, 4096, 128];16.0;288.1;288.1;288.1;bfloat16[4096, 128]bfloat16[4096, 128];2.0;;0.0;;;-153



In [14]:
dump_memory_snapshot()

Dumped memory snapshot to file: memory_snapshot_20231202_022710.pickle


In [15]:
display_memory_snapshot()

In [8]:
del exp


IPyExperimentsPytorch: Finishing

*** Experiment finished in 00:24:30 (elapsed wallclock time)

*** Newly defined local variables:
Deleted: model_id

*** Circular ref objects gc collected during the experiment:
cleared 1049 objects (only temporary leakage)

*** Experiment memory:
RAM: Consumed       Reclaimed
CPU:    2,002        0 MB (  0.00%)
GPU:     -162        0 MB ( -0.00%)

*** Current state:
RAM:    Used    Free   Total       Util
CPU:   2,571  27,941  31,785 MB   8.09% 
GPU:     910  23,653  24,564 MB   3.71% 


