# Load a french dataset

In [None]:
pip install datasets

In [1]:
from datasets import load_dataset

dataset_path = "frenchtext/banque-fr-2311"

In [None]:
dataset = load_dataset(dataset_path)

In [3]:
dataset["valid"][0]["Text"]

"# Les nouvelles normes européennes sur le paiement pourraient affecter l'e-commerce\r\n\r\nicone ecommerce\r\n\r\nLes nouvelles règles européennes sur la sécurisation des paiements en ligne entreront en vigueur à partir du 14 septembre 2019. Elles ont notamment été pensées pour limiter les fraudes dans le domaine. Les banques et les acteurs du secteur interpellent toutefois les autorités sur les perturbations pouvant être induites par le déploiement de cette nouvelle norme.\r\n\r\nLes plateformes d'e-commerce sont généralement débordées en fin d’année, notamment avec Thanksgiving, le Black Friday et les achats de Noël. Cette période s’annonce encore plus compliquée pour 2019.\r\n\r\nEn effet, les nouvelles normes de sécurité pour le paiement en ligne seront appliquées en Europe à compter de mi-septembre. Elles concerneront notamment les banques, les fournisseurs de services de paiement et les e-commerçants.\r\n\r\nAu regard de cette échéance, les protagonistes de ces différents secteu

# Get popular tokenizers

In [None]:
pip install --upgrade transformers

In [None]:
pip install sentencepiece

In [None]:
pip install tiktoken

In [2]:
models = { 
    #"rmkv_world_1b5" : "BlinkDL/rwkv-5-world",

    "btlm_3b" : "cerebras/btlm-3b-8k-base",
    "redpajama_3b" : "togethercomputer/RedPajama-INCITE-Base-3B-v1",
    "open_llama_3b" : "openlm-research/open_llama_3b_v2",
    "stablelm_3b" : "stabilityai/stablelm-3b-4e1t",

    "yi_6b" : "01-ai/Yi-6B",
    "mistral_7b" : "mistralai/Mistral-7B-v0.1",
    "mpt_7b" : "mosaicml/mpt-7b",
    "falcon_7b" : "tiiuae/falcon-7b",
    "redpajama_7b" : "togethercomputer/RedPajama-INCITE-7B-Base",
    "llama2_7b_32k" : "togethercomputer/LLaMA-2-7B-32K",
    "open_llama_7b" : "openlm-research/open_llama_7b_v2",
    "mpt_7b_8k" : "mosaicml/mpt-7b-8k",
    "qwen_7b" : "Qwen/Qwen-7B",
    "llama2_7b" : "meta-llama/Llama-2-7b-hf",
    "bloomz_7b" : "bigscience/bloomz-7b1-mt",

    "llama2_13b" : "meta-llama/Llama-2-13b-hf",
    "qwen_14b" : "Qwen/Qwen-14B",

    "mpt_30b" : "mosaicml/mpt-30b",
    "yi_34b" : "01-ai/Yi-34B",
    "falcon_40b" : "tiiuae/falcon-40b"
}

In [2]:
from transformers import AutoTokenizer
import json

models_tokenizers = {}
models_tokenizers_config = {}
models_tokenizers_specialtokens = {}
for model in models.keys():
    print("------------------------")
    print(f"Loading {model} tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(models[model], trust_remote_code=True)
    models_tokenizers[model] = tokenizer
    config = {}
    specialtokens = {}
    config["type"] = type(tokenizer)
    config["vocab_size"] = tokenizer.vocab_size
    config["model_max_length"] = tokenizer.model_max_length
    if hasattr(tokenizer, "special_tokens"): specialtokens["special_tokens"] = tokenizer.special_tokens
    config["padding_side"] = tokenizer.padding_side
    config["truncation_side"] = tokenizer.truncation_side
    config["clean_up_tokenization_spaces"] = tokenizer.clean_up_tokenization_spaces
    if tokenizer.is_fast:
        backend_config = json.loads(tokenizer.backend_tokenizer.to_str())
        if "vocab" in backend_config["model"]: del backend_config["model"]["vocab"]
        if "merges" in backend_config["model"]: del backend_config["model"]["merges"]
        config['truncation'] = backend_config['truncation']
        config['padding'] = backend_config['padding']
        specialtokens['added_tokens'] = backend_config['added_tokens']
        config['normalizer'] = backend_config['normalizer']
        config['pre_tokenizer'] = backend_config['pre_tokenizer']
        config['model'] = backend_config['model']
        config['post_processor'] = backend_config['post_processor']
        config['decoder'] = backend_config['decoder']
    elif model[:3]=="yi_":
        config['model'] = type(tokenizer.sp_model)
    models_tokenizers_config[model] = config
    models_tokenizers_specialtokens[model] = specialtokens
    print(config)

------------------------
Loading btlm_3b tokenizer
{'type': <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>, 'vocab_size': 50257, 'model_max_length': 8192, 'padding_side': 'right', 'truncation_side': 'right', 'clean_up_tokenization_spaces': True, 'truncation': None, 'padding': None, 'normalizer': None, 'pre_tokenizer': {'type': 'ByteLevel', 'add_prefix_space': False, 'trim_offsets': True, 'use_regex': True}, 'model': {'type': 'BPE', 'dropout': None, 'unk_token': None, 'continuing_subword_prefix': '', 'end_of_word_suffix': '', 'fuse_unk': False, 'byte_fallback': False}, 'post_processor': {'type': 'ByteLevel', 'add_prefix_space': True, 'trim_offsets': False, 'use_regex': True}, 'decoder': {'type': 'ByteLevel', 'add_prefix_space': True, 'trim_offsets': True, 'use_regex': True}}
------------------------
Loading redpajama_3b tokenizer
{'type': <class 'transformers.models.gpt_neox.tokenization_gpt_neox_fast.GPTNeoXTokenizerFast'>, 'vocab_size': 50254, 'model_max_l

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


{'type': <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>, 'vocab_size': 32000, 'model_max_length': 2048, 'padding_side': 'left', 'truncation_side': 'right', 'clean_up_tokenization_spaces': False, 'truncation': None, 'padding': None, 'normalizer': {'type': 'Sequence', 'normalizers': [{'type': 'Prepend', 'prepend': '▁'}, {'type': 'Replace', 'pattern': {'String': ' '}, 'content': '▁'}]}, 'pre_tokenizer': None, 'model': {'type': 'BPE', 'dropout': None, 'unk_token': '<unk>', 'continuing_subword_prefix': None, 'end_of_word_suffix': None, 'fuse_unk': True, 'byte_fallback': True}, 'post_processor': {'type': 'TemplateProcessing', 'single': [{'SpecialToken': {'id': '<s>', 'type_id': 0}}, {'Sequence': {'id': 'A', 'type_id': 0}}], 'pair': [{'SpecialToken': {'id': '<s>', 'type_id': 0}}, {'Sequence': {'id': 'A', 'type_id': 0}}, {'SpecialToken': {'id': '<s>', 'type_id': 1}}, {'Sequence': {'id': 'B', 'type_id': 1}}], 'special_tokens': {'<s>': {'id': '<s>', 'ids': [1], 't

# Test tokenizers on french dataset

In [39]:
tokenizer = models_tokenizers["falcon_7b"]

def tokenization(example):
    return tokenizer(example["Text"])

In [40]:
dataset = load_dataset(dataset_path, split="train+valid+test")
dataset = dataset.map(tokenization, batched=True)

Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

In [41]:
words = 0
tokens = 0

for example in dataset:
    words += example['Words']
    tokens += len(example['input_ids'])
    
words, tokens

(67061556, 131722778)

In [43]:
tokenizer.vocab_size, tokens/words

(65024, 1.9642070040844266)

In [6]:
dataset = load_dataset(dataset_path, split="train+valid+test")
    
for model in models:
    tokenizer = models_tokenizers[model]

    def tokenization(example):
        return tokenizer(example["Text"])
    
    dataset = dataset.map(tokenization, batched=True)
    
    words = 0
    tokens = 0
    for example in dataset:
        words += example['Words']
        tokens += len(example['input_ids'])
        
    print("------------------------")
    print(f"{model}: {tokenizer.vocab_size} vocab => {tokens/words} tokens per word")

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

  table = cls._concat_blocks(blocks, axis=0)


------------------------
btlm_3b: 50257 vocab => 2.340823750048388 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2765 > 2048). Running this sequence through the model will result in indexing errors


------------------------
redpajama_3b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3326 > 2048). Running this sequence through the model will result in indexing errors


------------------------
open_llama_3b: 32000 vocab => 2.4012566752850173 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
stablelm_3b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4374 > 4096). Running this sequence through the model will result in indexing errors


------------------------
yi_6b: 64000 vocab => 2.4612755630066205 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
mistral_7b: 32000 vocab => 2.234656410298622 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2765 > 2048). Running this sequence through the model will result in indexing errors


------------------------
mpt_7b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3038 > 2048). Running this sequence through the model will result in indexing errors


------------------------
falcon_7b: 65024 vocab => 1.9642070040844266 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2765 > 2048). Running this sequence through the model will result in indexing errors


------------------------
redpajama_7b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (127882 > 32768). Running this sequence through the model will result in indexing errors


------------------------
llama2_7b_32k: 32000 vocab => 2.1736836675844504 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3326 > 2048). Running this sequence through the model will result in indexing errors


------------------------
open_llama_7b: 32000 vocab => 2.4012566752850173 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9937 > 8192). Running this sequence through the model will result in indexing errors


------------------------
mpt_7b_8k: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (10460 > 8192). Running this sequence through the model will result in indexing errors


------------------------
qwen_7b: 151851 vocab => 1.8874039844825552 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
llama2_7b: 32000 vocab => 2.1749545745702648 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
bloomz_7b: 250680 vocab => 1.4450713759161806 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
llama2_13b: 32000 vocab => 2.1749545745702648 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (10460 > 8192). Running this sequence through the model will result in indexing errors


------------------------
qwen_14b: 151851 vocab => 1.8874039844825552 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (9937 > 8192). Running this sequence through the model will result in indexing errors


------------------------
mpt_30b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4374 > 4096). Running this sequence through the model will result in indexing errors


------------------------
yi_34b: 64000 vocab => 2.4612755630066205 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3038 > 2048). Running this sequence through the model will result in indexing errors


------------------------
falcon_40b: 65024 vocab => 1.9642070040844266 tokens per word


In [7]:
other_models = { 
    "qwen_14b" : "Qwen/Qwen-14B",

    "mpt_30b" : "mosaicml/mpt-30b",
    "yi_34b" : "01-ai/Yi-34B",
    "falcon_40b" : "tiiuae/falcon-40b"
}

In [8]:
for model in other_models:
    tokenizer = models_tokenizers[model]

    def tokenization(example):
        return tokenizer(example["Text"])
    
    dataset = dataset.map(tokenization, batched=True)
    
    words = 0
    tokens = 0
    for example in dataset:
        words += example['Words']
        tokens += len(example['input_ids'])
        
    print("------------------------")
    print(f"{model}: {tokenizer.vocab_size} vocab => {tokens/words} tokens per word")

Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
qwen_14b: 151851 vocab => 1.8874039844825552 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
mpt_30b: 50254 vocab => 2.0548674265774567 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
yi_34b: 64000 vocab => 2.4612755630066205 tokens per word


Map:   0%|          | 0/85229 [00:00<?, ? examples/s]

------------------------
falcon_40b: 65024 vocab => 1.9642070040844266 tokens per word


# Train a tokenizer on french dataset

In [10]:
def batch_iterator(batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["Text"]

In [17]:
# Basic byte-level BPE
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
tokenizer = Tokenizer(models.BPE())
# tokenizer.normalizer = None
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=32000,
    min_frequency=100,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
    show_progress=True
)

In [18]:
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))






In [None]:
def tokenization(examples):
    return {'input_ids': [enc.ids for enc in tokenizer.encode_batch(examples["Text"])]}

dataset = dataset.map(tokenization, batched=True)

words = 0
tokens = 0
for example in dataset:
    words += example['Words']
    tokens += len(example['input_ids'])

print("------------------------")
print(f"custom: {tokens/words} tokens per word")

In [38]:
print(f"custom: {tokens/words} tokens per word")

custom: 1.4874704070391687 tokens per word


In [None]:
[token for token in tokenizer.get_vocab().keys() if len(token)>=10]

In [45]:
from collections import Counter

tokens_counts = Counter()

for example in dataset:
    tokens_counts.update(example['input_ids'])

In [47]:
len(tokens_counts)

31877

In [51]:
len(tokens_counts) - len([{tokenizer.decode([token]):tokens_counts[token]} for token in tokens_counts.keys() if tokens_counts[token]<100])

30087

In [52]:
 len([{tokenizer.decode([token]):tokens_counts[token]} for token in tokens_counts.keys() if tokens_counts[token]<100])

1790

# Test model perplexity

In [None]:
pip install --upgrade accelerate bitsandbytes

In [2]:
models = { 
    #"rmkv_world_1b5" : "BlinkDL/rwkv-5-world",

    "btlm_3b" : "cerebras/btlm-3b-8k-base",
    "redpajama_3b" : "togethercomputer/RedPajama-INCITE-Base-3B-v1",
    "open_llama_3b" : "openlm-research/open_llama_3b_v2",
    "stablelm_3b" : "stabilityai/stablelm-3b-4e1t",

    "yi_6b" : "01-ai/Yi-6B",
    "mistral_7b" : "mistralai/Mistral-7B-v0.1",
    "mpt_7b" : "mosaicml/mpt-7b",
    "falcon_7b" : "tiiuae/falcon-7b",
    "redpajama_7b" : "togethercomputer/RedPajama-INCITE-7B-Base",
    "llama2_7b_32k" : "togethercomputer/LLaMA-2-7B-32K",
    "open_llama_7b" : "openlm-research/open_llama_7b_v2",
    "mpt_7b_8k" : "mosaicml/mpt-7b-8k",
    "qwen_7b" : "Qwen/Qwen-7B",
    "llama2_7b" : "meta-llama/Llama-2-7b-hf",
    "bloomz_7b" : "bigscience/bloomz-7b1-mt",

    "llama2_13b" : "meta-llama/Llama-2-13b-hf",
    "qwen_14b" : "Qwen/Qwen-14B",

    "mpt_30b" : "mosaicml/mpt-30b",
    "yi_34b" : "01-ai/Yi-34B",
    "falcon_40b" : "tiiuae/falcon-40b"
}

In [3]:
model_id = models["redpajama_3b"]

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)     

In [4]:
from datasets import load_dataset
dataset_path = "frenchtext/banque-fr-2311"
dataset = load_dataset(dataset_path, split="train+valid+test")

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

In [5]:
text_example = dataset[0]["Text"]
len(text_example)

5375

In [6]:
encodings = tokenizer(text_example, return_tensors="pt")

In [7]:
import torch

with torch.no_grad():
    tokenize_input = tokenizer.encode(text_example[:2048], return_tensors="pt")
    loss = model(tokenize_input, labels=tokenize_input)[0]
torch.exp(loss.float()).item()

2023-11-17 23:50:07.766211: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-17 23:50:07.786800: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


6.375988006591797

In [8]:
for i in range(10):
    print(len(dataset[i]["Text"]))

5375
3689
3804
2104
7936
3802
10669
2171
3568
1862


In [9]:
tokenizer.pad_token = tokenizer.eos_token

In [10]:
encodings = tokenizer(text = dataset[0:10]["Text"], add_special_tokens=True, 
                      padding="longest", truncation=True, return_overflowing_tokens=True, stride=int(tokenizer.model_max_length/2),
                      # 2020: https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#tensor-core-shape
                      # However now in 2023, this is less and less true, newer drivers and cuda versions are smarter about this and will be able to use tensorcores even without this aligned padding
                      pad_to_multiple_of=16)

In [11]:
for sample_mapping,input_ids,attention_mask in zip(encodings["overflow_to_sample_mapping"],encodings["input_ids"],encodings["attention_mask"]):
    print(sample_mapping,len(input_ids),sum(attention_mask))  

0 2048 1630
1 2048 1160
2 2048 1162
3 2048 658
4 2048 2048
4 2048 1741
5 2048 1175
6 2048 2048
6 2048 2048
6 2048 1234
7 2048 648
8 2048 1072
9 2048 585


In [12]:
sorted_dataset = dataset.sort("Words").filter(lambda example: example["Words"]>0)

In [13]:
def get_dataset_batches(batch_size=32):
    dataset_length = len(sorted_dataset)
    for start_idx in range(0, dataset_length, batch_size):
        end_idx = min(start_idx + batch_size, dataset_length)
        yield sorted_dataset[start_idx:end_idx]

In [14]:
def encode_dataset_batch(dataset_batch):
    return tokenizer(text = dataset_batch["Text"], add_special_tokens=True, 
                      padding="longest", truncation=True, return_overflowing_tokens=True, stride=int(tokenizer.model_max_length/2),
                      # 2020: https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#tensor-core-shape
                      # However now in 2023, this is less and less true, newer drivers and cuda versions are smarter about this and will be able to use tensorcores even without this aligned padding
                      pad_to_multiple_of=16,
                      return_tensors="pt")

In [15]:
for idx,dataset_batch in enumerate(get_dataset_batches()):
    encodings = encode_dataset_batch(dataset_batch)
    print(encodings["attention_mask"].sum(axis=1))
    print(encodings["input_ids"].size())
    if idx == 3: break

tensor([ 5,  4,  6,  5,  7, 23,  7,  7,  7,  7,  7, 10,  7,  7, 13,  6,  4,  6,
         6, 15,  4,  4,  4, 12, 12, 12, 12, 12, 12, 12, 12, 12])
torch.Size([32, 32])
tensor([12, 12, 12, 12, 12, 12, 12,  6, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12])
torch.Size([32, 16])
tensor([12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
         6, 12, 12, 12,  7, 12, 12, 12, 12, 12, 12, 12, 12, 47])
torch.Size([32, 48])
tensor([45, 45, 44, 48, 55, 39, 51, 46, 43, 42, 40, 47, 41, 44, 47, 42, 40, 62,
        43, 55, 34, 44, 42, 53, 53, 30, 56, 52, 51, 53, 37, 41])
torch.Size([32, 64])


In [None]:
from tqdm import tqdm

batch_size = 32

nlls = []
for idx,dataset_batch in enumerate(get_dataset_batches(batch_size)):
    
    encodings = encode_dataset_batch(dataset_batch)

    with torch.no_grad():
        outputs = model(encodings["input_ids"], attention_mask=encodings["attention_mask"], labels=encodings["input_ids"])
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)
    
    if idx%10==0: print(f"{(idx+1)*batch_size} / {len(dataset)}: {neg_log_likelihood}")

perplexity = torch.exp(torch.stack(nlls).mean().float())
print(perplexity)

15 min

32 / 85229: 7.71484375
352 / 85229: 7.09765625
672 / 85229: 8.1015625
992 / 85229: 5.12109375
1312 / 85229: 5.78125
1632 / 85229: 5.51953125
1952 / 85229: 6.12109375
2272 / 85229: 3.36328125
2592 / 85229: 5.26171875
2912 / 85229: 5.0703125
3232 / 85229: 4.87109375
3552 / 85229: 4.74609375
3872 / 85229: 4.046875
4192 / 85229: 4.3046875
4512 / 85229: 4.11328125
4832 / 85229: 4.56640625
5152 / 85229: 4.51171875
5472 / 85229: 4.359375
5792 / 85229: 4.32421875
6112 / 85229: 4.68359375
6432 / 85229: 4.6015625
6752 / 85229: 3.658203125
7072 / 85229: 4.7734375
7392 / 85229: 6.6328125
7712 / 85229: 3.869140625
8032 / 85229: 4.9375
8352 / 85229: 4.359375
8672 / 85229: 5.0078125
8992 / 85229: 4.09375
9312 / 85229: 3.513671875
9632 / 85229: 3.578125
9952 / 85229: 3.609375
10272 / 85229: 3.505859375
10592 / 85229: 2.99609375
10912 / 85229: 3.5859375
11232 / 85229: 3.18359375
11552 / 85229: 3.48046875
11872 / 85229: 3.11328125
12192 / 85229: 3.40234375
12512 / 85229: 3.189453125
12832 / 85229: 4.3515625
13152 / 85229: 3.306640625
13472 / 85229: 3.40234375
13792 / 85229: 3.162109375
14112 / 85229: 3.451171875
14432 / 85229: 2.845703125
14752 / 85229: 4.3515625
15072 / 85229: 4.2890625
15392 / 85229: 2.91015625
15712 / 85229: 3.1328125
16032 / 85229: 4.02734375
16352 / 85229: 3.126953125
16672 / 85229: 3.478515625
16992 / 85229: 3.193359375
17312 / 85229: 3.3203125
17632 / 85229: 3.0078125

OOM

In [17]:
perplexity = torch.exp(torch.stack(nlls).mean().float())
print(perplexity)

tensor(64.5841)


# Memory study

## CUDA helper functions

[CUDA semantics / Memory management](https://pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management)

[Understanding CUDA Memory Usage](https://pytorch.org/docs/stable/torch_cuda_memory.html#torch-cuda-memory)

[CUDA memory management API](https://pytorch.org/docs/stable/cuda.html#cuda-memory-management-api)

In [5]:
import torch
from datetime import datetime
from IPython.display import HTML
import pickle

memory_unit = 1024*1024
total_memory = torch.cuda.get_device_properties(0).total_memory

def display_memory():
    print(torch.cuda.get_device_name(0))
    print(f"Total    : {(total_memory/memory_unit):8,.1f} MB")
    print("------------------------------")
    free_memory = torch.cuda.mem_get_info()[0]
    reserved_memory = torch.cuda.memory_reserved(0)
    used_memory = torch.cuda.memory_allocated(0)    
    max_used_memory = torch.cuda.max_memory_allocated(0)
    overhead_memory = total_memory - free_memory - reserved_memory
    print(f"Overhead : {(overhead_memory/memory_unit):8,.1f} MB - {int(overhead_memory/total_memory*100):3} %")
    print(f"Reserved : {(reserved_memory/memory_unit):8,.1f} MB - {int(reserved_memory/total_memory*100):3} %")
    print(f"Free     : {(free_memory/memory_unit):8,.1f} MB - {int(free_memory/total_memory*100):3} %")
    print("------------------------------")
    print(f"Used     : {(used_memory/memory_unit):8,.1f} MB - {int(used_memory/total_memory*100):3} %")
    print(f"Max used : {(max_used_memory/memory_unit):8,.1f} MB - {int(max_used_memory/total_memory*100):3} %")
    
def display_memory_summary():
    print(torch.cuda.memory_summary())
    
def release_cached_memory():
    torch.cuda.empty_cache()
    
def reset_peak_memory_stats():
    torch.cuda.reset_peak_memory_stats()

def record_memory_history(enabled):
    torch.cuda.memory._record_memory_history(enabled=enabled)
    
def dump_memory_snapshot():
    filename_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"memory_snapshot_{filename_datetime}.pickle"
    s = torch.cuda.memory._snapshot(0)
    with open(filename, "wb") as f:
        pickle.dump(s, f)
    print(f"Dumped memory snapshot to file: {filename}")
    
def display_memory_snapshot():
    url = "https://pytorch.org/memory_viz"
    return HTML(f"Call dump_memory_snapshot(), <a href='{url}' target='_blank'>click here to open Pytorch memory viz</a>, then drag and drop the snapshot file")
    
display_memory()

NVIDIA GeForce RTX 4090
Total    : 24,563.5 MB
------------------------------
Overhead :  1,633.5 MB -   6 %
Reserved :  7,442.0 MB -  30 %
Free     : 15,488.0 MB -  63 %
------------------------------
Used     :  6,718.8 MB -  27 %
Max used :  6,921.0 MB -  28 %


In [6]:
record_memory_history(True)

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "togethercomputer/RedPajama-INCITE-Base-3B-v1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)   

In [19]:
dump_memory_snapshot()

Dumped memory snapshot to file: memory_snapshot_20231118_130500.pickle


In [20]:
display_memory_snapshot()

## Pytorch models exploration

In [7]:
from collections import OrderedDict

def find_attribute_origin(obj, attr_name):
    for cls in obj.__class__.__mro__:
        if attr_name in dir(cls):
            return cls.__name__
    return obj.__class__.__name__

def display_members(obj):
    obj_attributes = {}
    for member_name in dir(obj):
        if member_name[0:1]!="_":
            obj_attributes[getattr(obj,member_name).__qualname__ if hasattr(getattr(obj,member_name),"__qualname__") else f"{find_attribute_origin(obj,member_name)}.{member_name}"] = str(type(getattr(obj,member_name)))
    obj_attributes = {k: obj_attributes[k] for k in sorted(obj_attributes)}
    for member_name in obj_attributes.keys():
        print(member_name, obj_attributes[member_name])

In [82]:
display_members(model)

GPTNeoXConfig <class 'type'>
GPTNeoXForCausalLM.T_destination <class 'typing.TypeVar'>
GPTNeoXForCausalLM.base_model <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXModel'>
GPTNeoXForCausalLM.base_model_prefix <class 'str'>
GPTNeoXForCausalLM.call_super_init <class 'bool'>
GPTNeoXForCausalLM.config <class 'transformers.models.gpt_neox.configuration_gpt_neox.GPTNeoXConfig'>
GPTNeoXForCausalLM.device <class 'torch.device'>
GPTNeoXForCausalLM.dtype <class 'torch.dtype'>
GPTNeoXForCausalLM.dummy_inputs <class 'dict'>
GPTNeoXForCausalLM.dump_patches <class 'bool'>
GPTNeoXForCausalLM.embed_out <class 'torch.nn.modules.linear.Linear'>
GPTNeoXForCausalLM.forward <class 'functools.partial'>
GPTNeoXForCausalLM.framework <class 'str'>
GPTNeoXForCausalLM.generation_config <class 'transformers.generation.configuration_utils.GenerationConfig'>
GPTNeoXForCausalLM.get_output_embeddings <class 'method'>
GPTNeoXForCausalLM.gpt_neox <class 'transformers.models.gpt_neox.modeling_gpt_neox.GPT

In [11]:
from torch.nn import ModuleList
import inspect

memory_unit_mb = 1024*1024

def display_modules(module, name_prefix=None, depth=0, max_depth=99, forward_methods=None):
    if forward_methods is None:
        forward_methods = {}
    header = module.__class__.__name__
    if name_prefix is not None:
        header = f"{name_prefix}#{header}" 
    depth_prefix = "  "*depth
    print(depth_prefix+"---------------------")
    print(depth_prefix+header)
    if len(list(module.named_parameters(recurse=False))) > 0:
        print(depth_prefix+"> parameters")
        for name,parameter in module.named_parameters(recurse=False):
            print(depth_prefix+f"- {name}: {get_tensor_description(parameter)}")
    if len(list(module.named_buffers(recurse=False))) > 0:
        print(depth_prefix+"> buffers")
        for name,buffer in module.named_buffers(recurse=False):
            print(depth_prefix+f"- {name}: {get_tensor_description(buffer)}")
    if len(list(module.named_children())) > 0:
        print(depth_prefix+"> submodules")
        for name,submodule in module.named_children():
            print(depth_prefix+f"- {name}: {submodule.__class__.__name__}")
    source_code = inspect.getsource(module.forward)
    forward_methods[module.__class__.__name__] = source_code
    if depth < max_depth:
        for name,submodule in module.named_children():
            if isinstance(submodule, ModuleList):
                display_module_list(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
            else:
                display_modules(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
    if depth==0:
        print()
        print()
        for module_type,source_code in forward_methods.items():
            print("---------------------")
            print(f"{module_type}.forward()")
            print("---------------------")
            print(source_code)
            
def display_module_list(module_list, name_prefix=None, depth=0, max_depth=1, forward_methods=None):
    # ------------------------------
    # Detect repeated layers in ModuleList: code inspired from Pytorch: ModuleList.__repr__    
    list_of_reprs = [repr(item) for item in module_list]
    if len(list_of_reprs) == 0:
        return

    start_end_indices = [[0, 0]]
    repeated_blocks = [list_of_reprs[0]]
    for i, r in enumerate(list_of_reprs[1:], 1):
        if r == repeated_blocks[-1]:
            start_end_indices[-1][1] += 1
            continue

        start_end_indices.append([i, i])
        repeated_blocks.append(r)
    # -------------------------------
    
    depth_prefix = "  "*depth
    print(depth_prefix+"---------------------")
    print(depth_prefix+f"{name_prefix}#ModuleList")
    print(depth_prefix+"> submodules")
    named_submodules = []
    for (start_id, end_id) in start_end_indices:
        submodule = module_list[start_id]
        if start_id != end_id:      
            name = f"{start_id}..{end_id}"
            print(depth_prefix+f"- {name}: {(end_id-start_id+1)}X {submodule.__class__.__name__}")
        else:
            name = str(start_id)
            print(depth_prefix+f"- {name}: {submodule.__class__.__name__}")        
        named_submodules.append((name,submodule))
    if depth < max_depth:
        for name,submodule in named_submodules:
            if isinstance(submodule, ModuleList):
                display_module_list(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
            else:
                display_modules(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)

def get_tensor_description(t):
    dtype = str(t.dtype)[6:]
    dimensions = str(t.size())[11:-1]
    total_byte_size = t.numel() * t.element_size()
    return f"{dtype} {dimensions} ({(total_byte_size/memory_unit_mb):.1f} MB)"

In [195]:
display_modules(model)

---------------------
GPTNeoXForCausalLM
> submodules
- gpt_neox: GPTNeoXModel
- embed_out: Linear
  ---------------------
  gpt_neox#GPTNeoXModel
  > submodules
  - embed_in: Embedding
  - emb_dropout: Dropout
  - layers: ModuleList
  - final_layer_norm: LayerNorm
    ---------------------
    embed_in#Embedding
    > parameters
    - weight: float16 [50432, 2560] (246.2 MB)
    ---------------------
    emb_dropout#Dropout
    ---------------------
    layers#ModuleList
    > submodules
    - 0..31: 32X GPTNeoXLayer
      ---------------------
      0..31#GPTNeoXLayer
      > submodules
      - input_layernorm: LayerNorm
      - post_attention_layernorm: LayerNorm
      - post_attention_dropout: Dropout
      - post_mlp_dropout: Dropout
      - attention: GPTNeoXAttention
      - mlp: GPTNeoXMLP
        ---------------------
        input_layernorm#LayerNorm
        > parameters
        - weight: float16 [2560] (0.0 MB)
        - bias: float16 [2560] (0.0 MB)
        ----------------

# Huggingface language models performance

In [None]:
pip install psutil

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import psutil
import torch
from transformers.utils.hub import cached_file

def get_model_path_and_size_on_disk(model):    
    model_config_file = cached_file(model.name_or_path, "config.json", local_files_only=True)
    model_directory = os.path.dirname(model_config_file)
    
    total_size = 0
    for entry in os.listdir(model_directory):
        full_entry_path = os.path.join(model_directory, entry)
        if os.path.isfile(full_entry_path):
            total_size += os.path.getsize(full_entry_path)
    return model_directory,total_size

def get_used_cpu_memory():
    process = psutil.Process(os.getpid())
    process_memory = process.memory_info().rss
    return process_memory

def get_used_and_max_gpu_memory():
    used_memory = torch.cuda.memory_allocated(0)    
    max_used_memory = torch.cuda.max_memory_allocated(0)
    return used_memory,max_used_memory

def reset_max_gpu_memory():
    torch.cuda.reset_peak_memory_stats()

In [None]:
path,size = get_model_path_and_size_on_disk(model)
print(f"Model files: {(size/1024/1024/1024):.2f} GB stored in {path}")
print()
cpu_memory = get_used_cpu_memory()
print(f"The current process uses {(cpu_memory/1024/1024/1024):.2f} GB of CPU memory")
print()
gpu_memory, max_gpu_memory = get_used_and_max_gpu_memory()
print(f"The current process uses {(gpu_memory/1024/1024/1024):.2f} GB of GPU memory")
print(f"The maximum use of GPU so far was {(gpu_memory/1024/1024/1024):.2f} GB")

In [2]:
from time import perf_counter_ns
from transformers import AutoModelForCausalLM, AutoTokenizer

memory_unit_mb = 1024*1024
memory_unit_gb = 1024*1024*1024

time_unit_µs = 1000
time_unit_ms = 1000*1000
time_unit_s = 1000*1000*1000

def get_tensor_params_size_and_dim(param):
    if param is None:
        return 0,""
    elif isinstance(param, torch.Tensor):
        psize = param.numel() * param.element_size()
        pdim = f"{str(param.dtype)[6:]}{str(param.size())[11:-1]}"
        return psize,pdim
    elif isinstance(param, dict):
        size = 0
        dim = ""
        for value in param.values():            
            psize, pdim = get_tensor_params_size_and_dim(value)
            size += psize
            dim += pdim
        return size, dim
    else:
        try:
            iter(param)
            size = 0
            dim = ""
            for value in param:            
                psize, pdim = get_tensor_params_size_and_dim(value)
                size += psize
                dim += pdim
            return size, dim
        except TypeError:
            return 0,""

class ModulePerf:
    
    def __init__(self, module_name, module, is_leaf_module):
        self.module_name = module_name
        self.module = module
        self.is_leaf_module = is_leaf_module
        
        self.before_forward_time_ns = 0
        self.before_forward_used_memory = 0
        self.forward_inputs_memory_size = 0 
        self.forward_inputs_memory_dim = "" 
        
        self.after_forward_time_ns = 0
        self.after_forward_used_memory = 0
        self.forward_max_used_memory = 0        
        self.forward_outputs_memory_size = 0
        self.forward_outputs_memory_dim = ""
        
        self.before_backward_time_ns = 0
        self.before_backward_used_memory = 0
        self.backward_inputs_memory_size = 0
        self.backward_inputs_memory_dim = "" 
        
        self.after_backward_time_ns = 0
        self.after_backward_used_memory = 0
        self.backward_max_used_memory = 0
        self.backward_outputs_memory_size = 0
        self.backward_outputs_memory_dim = ""
        
    def before_forward(self, module, args, kwargs):
        self.before_forward_time_ns = perf_counter_ns()
        self.before_forward_used_memory,_ = get_used_and_max_gpu_memory()   
        args_size,args_dim = get_tensor_params_size_and_dim(args)
        kwargs_size,kwargs_dim = get_tensor_params_size_and_dim(kwargs) 
        self.forward_inputs_memory_size = args_size + kwargs_size
        self.forward_inputs_memory_dim = args_dim + kwargs_dim
        if self.is_leaf_module: reset_max_gpu_memory()
        
    def after_forward(self, module, args, kwargs, output):
        self.after_forward_time_ns = perf_counter_ns()
        self.after_forward_used_memory, self.forward_max_used_memory = get_used_and_max_gpu_memory()
        self.forward_outputs_memory_size, self.forward_outputs_memory_dim = get_tensor_params_size_and_dim(output) 
        
    def before_backward(self, module, grad_output):
        self.before_backward_time_ns = perf_counter_ns()
        self.before_backward_used_memory,_ = get_used_and_max_gpu_memory()
        self.backward_inputs_memory_size, self.backward_inputs_memory_dim = get_tensor_params_size_and_dim(grad_output) 
        
    def after_backward(self, module, grad_input, grad_output):
        self.after_backward_time_ns = perf_counter_ns()
        self.after_backward_used_memory, self.backward_max_used_memory = get_used_and_max_gpu_memory()
        self.backward_outputs_memory_size, self.backward_outputs_memory_dim = get_tensor_params_size_and_dim(grad_input) 
    
    def get_stats_line(self, initial_used_memory):
        return f"{self.module_name};{self.is_leaf_module};;{(self.after_forward_time_ns-self.before_forward_time_ns)/time_unit_µs:.1f};;{self.forward_inputs_memory_dim};{self.forward_inputs_memory_size/memory_unit_mb:.1f};{(self.before_forward_used_memory-initial_used_memory)/memory_unit_mb:.1f};{(self.forward_max_used_memory-initial_used_memory)/memory_unit_mb:.1f};{(self.after_forward_used_memory-initial_used_memory)/memory_unit_mb:.1f};{self.forward_outputs_memory_dim};{self.forward_outputs_memory_size/memory_unit_mb:.1f};;{(self.after_backward_time_ns-self.before_backward_time_ns)/time_unit_µs:.1f};;{self.backward_inputs_memory_dim};{(self.backward_inputs_memory_size-initial_used_memory)/memory_unit_mb:.1f};{(self.before_backward_used_memory-initial_used_memory)/memory_unit_mb:.1f};{(self.backward_max_used_memory-initial_used_memory)/memory_unit_mb:.1f};{self.after_backward_used_memory/memory_unit_mb:.1f};{self.backward_outputs_memory_dim};{self.backward_outputs_memory_size/memory_unit_mb:.1f}"
    
class ModelForCausalLMBenchmark:   
    
    @staticmethod
    def download_in_local_cache(pretrained_model_id, **kwargs):
        print(f"Loading model {pretrained_model_id} in local cache ...")
        AutoTokenizer.from_pretrained(pretrained_model_id)
        AutoModelForCausalLM.from_pretrained(pretrained_model_id, **kwargs)
        print("--> Reset the kernel now before going further")
    
    def __init__(self, pretrained_model_id):
        self.pretrained_model_id = pretrained_model_id
        self.tokenizer = None 
        self.model = None
        
        self.model_path = None
        self.model_size_on_disk = 0
        self.tokenizer_load_time_ns = 0
        self.tokenizer_cpu_memory = 0
        self.model_load_time_ns = 0
        self.model_cpu_memory = 0
        self.model_gpu_memory = 0
        self.model_load_max_gpu_memory = 0
        
    def trace_load_from_cache(self, **kwargs):
        cpu_memory_before = get_used_cpu_memory()
        gpu_memory_before = get_used_and_max_gpu_memory()[0]
        reset_max_gpu_memory()        
        time_before = perf_counter_ns()
        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_id)
        cpu_memory_tokenizer = get_used_cpu_memory()
        time_tokenizer = perf_counter_ns()
        self.model = AutoModelForCausalLM.from_pretrained(self.pretrained_model_id, **kwargs)
        cpu_memory_model = get_used_cpu_memory()
        gpu_memory_model,max_gpu_memory_model = get_used_and_max_gpu_memory()     
        time_model = perf_counter_ns()
        
        self.model_path,self.model_size_on_disk = get_model_path_and_size_on_disk(self.model)
        self.tokenizer_load_time_ns = time_tokenizer-time_before
        self.tokenizer_cpu_memory = cpu_memory_tokenizer-cpu_memory_before
        self.model_load_time_ns = time_model-time_tokenizer
        self.model_cpu_memory = cpu_memory_model-cpu_memory_tokenizer
        self.model_gpu_memory = gpu_memory_model-gpu_memory_before
        self.model_load_max_gpu_memory = max_gpu_memory_model
        
        self.display_load_results()            
    
    def display_load_results(self):
        print(f"Model files: {(self.model_size_on_disk/1024/1024/1024):.2f} GB on disk")
        print(""f"(cache path: {self.model_path})")
        print()
        print(f"Tokenizer load time : {(self.tokenizer_load_time_ns/time_unit_ms):.2f} ms")
        print(f"Tokenizer CPU memory: {(self.tokenizer_cpu_memory/memory_unit_mb):.2f} MB")
        print()
        print(f"Model load time : {(self.model_load_time_ns/time_unit_ms):.2f} ms")
        print(f"Model CPU memory: {(self.model_cpu_memory/memory_unit_gb):.2f} GB")
        print(f"Model GPU memory: {(self.model_gpu_memory/memory_unit_gb):.2f} GB")
        print(f"Max   GPU memory: {(self.model_load_max_gpu_memory/memory_unit_gb):.2f} GB")
        print()
        
    def trace_prefill(self, batch_size, seq_length):
        # warmup
        input_ids = torch.randint(low=0, high=self.tokenizer.vocab_size, size=(batch_size,seq_length), dtype=torch.int64).to(self.model.device)
        attention_mask = torch.ones(batch_size,seq_length).to(self.model.device)
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)        
        
        # measure perfs
        moduleperfs = []
        hookhandles = []
        try:
            for module_name,module in self.model.named_modules():
                if module_name=="": module_name="<model>"
                mperf = ModulePerf(module_name, module, len(list(module.children())) == 0)
                moduleperfs.append(mperf)                
                hookhandles.append(module.register_forward_pre_hook(mperf.before_forward, with_kwargs=True))
                hookhandles.append(module.register_forward_hook(mperf.after_forward, with_kwargs=True))
                hookhandles.append(module.register_full_backward_pre_hook(mperf.before_backward))
                hookhandles.append(module.register_full_backward_hook(mperf.after_backward))
            
            # perf test
            input_ids = torch.randint(low=0, high=self.tokenizer.vocab_size, size=(batch_size,seq_length), dtype=torch.int64).to(self.model.device)
            attention_mask = torch.ones(batch_size,seq_length).to(self.model.device)
            self.model.eval()
            with torch.no_grad():
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)
        finally:
            for handle in hookhandles:
                handle.remove()    
                
        # sort modules
        sorted_moduleperfs = sorted(moduleperfs, key=lambda mp: mp.after_forward_time_ns)
        first_mperf = None
        for mperf in sorted_moduleperfs:
            if mperf.before_forward_used_memory>0:
                first_mperf = mperf
                break
        initial_used_memory = first_mperf.before_forward_used_memory - first_mperf.forward_inputs_memory_size
        
        # display results
        print(f"Prefill test for batch size {batch_size} and sequence length {seq_length}:")
        for mperf in sorted_moduleperfs:
            if mperf.after_forward_time_ns>0:
                print(mperf.get_stats_line(initial_used_memory))
    
    def check_prefill(self):
        seq_length = self.tokenizer.model_max_length
        batch_size = 1
        
        # warmup
        input_ids = torch.randint(low=0, high=self.tokenizer.vocab_size, size=(batch_size,seq_length), dtype=torch.int64).to(self.model.device)
        attention_mask = torch.ones(batch_size,seq_length).to(self.model.device)
        self.model.eval()
        with torch.no_grad():
            self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)        

        # perf test
        base_gpu_memory,_ = get_used_and_max_gpu_memory()
        for batch_size in range(7,100):
            print(f"--- {batch_size} x {seq_length} ---")
            reset_max_gpu_memory()
            initial_gpu_memory,_ = get_used_and_max_gpu_memory()
            input_ids = torch.randint(low=0, high=self.tokenizer.vocab_size, size=(batch_size,seq_length), dtype=torch.int64).to(self.model.device)
            attention_mask = torch.ones(batch_size,seq_length).to(self.model.device)
            before_forward_time_ns = perf_counter_ns()
            with torch.no_grad():
                self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)        
            after_forward_time_ns = perf_counter_ns()
            gpu_memory, max_gpu_memory = get_used_and_max_gpu_memory()
            print(f"Forward pass  : {(after_forward_time_ns-before_forward_time_ns)/time_unit_ms:.1f} ms")
            print(f"Initial memory  : {((initial_gpu_memory-base_gpu_memory)/memory_unit_gb):.2f} GB")
            print(f"Maximum memory: {((max_gpu_memory-initial_gpu_memory)/memory_unit_gb):.2f} GB")
            print(f"Final memory  : {((gpu_memory-base_gpu_memory)/memory_unit_gb):.2f} GB")
    
    def trace_generate(self):
        return
    
    def trace_train(self):
        return

In [4]:
model_id = "togethercomputer/RedPajama-INCITE-Base-3B-v1"
ModelForCausalLMBenchmark.download_in_local_cache(model_id)

Loading model togethercomputer/RedPajama-INCITE-Base-3B-v1 in local cache ...
--> Reset the kernel now before going further


In [3]:
model_id = "togethercomputer/RedPajama-INCITE-Base-3B-v1"
model_benchmark = ModelForCausalLMBenchmark(model_id)
model_benchmark.trace_load_from_cache(device_map="cuda", torch_dtype=torch.float16)

Model files: 5.30 GB on disk
(cache path: /models/huggingface/transformers/models--togethercomputer--RedPajama-INCITE-Base-3B-v1/snapshots/094fbdd0c911feb485ce55de1952ab2e75277e1e)

Tokenizer load time : 241.08 ms
Tokenizer CPU memory: 43.74 MB

Model load time : 3607.25 ms
Model CPU memory: 0.35 GB
Model GPU memory: 5.33 GB
Max   GPU memory: 5.33 GB



In [None]:
model_benchmark.trace_prefill(2, 1000)

In [4]:
model_benchmark.check_prefill()

--- 7 x 2048 ---
Forward pass  : 1381.7 ms
Initial memory  : 0.00 GB
Maximum memory: 4.18 GB
Final memory  : 0.00 GB
--- 8 x 2048 ---
Forward pass  : 1576.7 ms
Initial memory  : 0.00 GB
Maximum memory: 4.78 GB
Final memory  : 0.00 GB
--- 9 x 2048 ---
Forward pass  : 1775.4 ms
Initial memory  : 0.00 GB
Maximum memory: 5.38 GB
Final memory  : 0.00 GB
--- 10 x 2048 ---
Forward pass  : 10473.8 ms
Initial memory  : 0.00 GB
Maximum memory: 5.98 GB
Final memory  : 0.00 GB
--- 11 x 2048 ---


KeyboardInterrupt: 

In [7]:
logits = outputs['logits']
past_key_values = outputs['past_key_values']

In [10]:
logits.size(),logits.dtype

(torch.Size([2, 1000, 50432]), torch.float16)

In [14]:
len(past_key_values), past_key_values[0][0].size(), past_key_values[0][0].dtype, past_key_values[0][1].size(), past_key_values[0][1].dtype

(32,
 torch.Size([2, 32, 1000, 80]),
 torch.float16,
 torch.Size([2, 32, 1000, 80]),
 torch.float16)

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "togethercomputer/RedPajama-INCITE-Base-3B-v1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

In [9]:
encodings = tokenizer(["un test","un deuxième test"], padding="longest", return_tensors="pt")

In [10]:
encodings

{'input_ids': tensor([[  328,  1071,     0,     0,     0],
        [  328, 23156,    74, 22722,  1071]]), 'attention_mask': tensor([[1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1]])}

In [13]:
encodings["input_ids"].size(), encodings["input_ids"].dtype

(torch.Size([2, 5]), torch.int64)

In [14]:
encodings["attention_mask"].size(), encodings["attention_mask"].dtype

(torch.Size([2, 5]), torch.int64)

In [22]:
output["logits"].size(), output["logits"].dtype

(torch.Size([2, 5, 50432]), torch.float16)

In [21]:
output.keys()

odict_keys(['logits', 'past_key_values'])

In [27]:
len(output["past_key_values"]),output["past_key_values"][0][1].size(),output["past_key_values"][0][1].dtype

(32, torch.Size([2, 32, 5, 80]), torch.float16)