# Most popular open weights LLMs - December 2023

## Dependencies

In [None]:
pip install datasets

In [1]:
import flash_attn
flash_attn.__version__

'2.4.2'

## Models dictionary

In [1]:
# Short model identifier -> Hugging Face Hub repository name.
# The short identifiers are the keys used by the per-model branches further down
# (tokenizer/model loading options, batch size).
# Trailing comments give a size in GB (presumably the checkpoint size — TODO confirm).
models = { 
    # ~1B parameters
    "tinyllama_1b": "TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T", # 4.10 GB
     
    # ~3B parameters
    "redpajama_3b" : "togethercomputer/RedPajama-INCITE-Base-3B-v1", # 5.30 GB
    "btlm_3b" : "cerebras/btlm-3b-8k-base", #  4.93 GB
    "openllama2_3b" : "openlm-research/open_llama_3b_v2", #  6.38 GB
    "stablelm_3b" : "stabilityai/stablelm-3b-4e1t", # 5.21 GB
    "phi2_3b" : "microsoft/phi-2", # 5.18 GB

    # ~6-7B parameters
    "bloomz_7b" : "bigscience/bloomz-7b1-mt", # 13.18 GB
    "falcon_7b" : "tiiuae/falcon-7b", # 13.45 GB       
    "redpajama_7b" : "togethercomputer/RedPajama-INCITE-7B-Base", # 12.90 GB
    "mpt_7b" : "mosaicml/mpt-7b", # 12.39 GB
    "mpt_7b_8k" : "mosaicml/mpt-7b-8k", # 12.39 GB
    "llama2_7b" : "meta-llama/Llama-2-7b-hf", # 12.55 GB
    "llama2_7b_32k" : "togethercomputer/LLaMA-2-7B-32K", # 12.55 GB
    "mistral_7b" : "mistralai/Mistral-7B-v0.1", # 13.49 GB
    "qwen_7b" : "Qwen/Qwen-7B", # 14.38 GB
    "yi_6b" : "01-ai/Yi-6B", # 11.29 GB
    "decilm_7b" : "Deci/DeciLM-7B", # 13.12 GB
    
    # ~10-14B parameters
    "openllama1_13b" : "openlm-research/open_llama_13b", # 24.24 GB
    "llama2_13b" : "meta-llama/Llama-2-13b-hf", # 24.25 GB
    "qwen_14b" : "Qwen/Qwen-14B", # 26.39 GB
    "solar_10b" : "upstage/SOLAR-10.7B-v1.0", # 19.99 GB
    
    # ~30B+ parameters: the repositories below are GPTQ / AWQ variants (see repository names);
    # the URL in each trailing comment presumably points to the original model — TODO confirm.
    # NOTE(review): several of these are instruct/fine-tuned variants, not base models.
    "llama1_33b" : "TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ", # 15.78 GB https://huggingface.co/alexl83/LLaMA-33B-HF
    "falcon_40b" : "TheBloke/falcon-40b-instruct-GPTQ", # 21.00 GB https://huggingface.co/tiiuae/falcon-40b
    "mpt_30b" : "abhinavkulkarni/mosaicml-mpt-30b-instruct-w4-g128-awq", # 15.00 GB https://huggingface.co/mosaicml/mpt-30b
    "codellama_34b" : "TheBloke/CodeLlama-34B-Instruct-GPTQ", # 17.07 GB https://huggingface.co/codellama/CodeLlama-34b-hf
    "yi_34b" : "TheBloke/Yi-34B-GPTQ", # 17.33 GB https://huggingface.co/01-ai/Yi-34B    
    "mixtral_8x7B" : "TheBloke/Mixtral-8x7B-v0.1-GPTQ" # 22.18 GB https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
}

## Load dataset

In [2]:
# Read the Hugging Face access token from a local file (kept out of the notebook itself)
# and strip surrounding whitespace / the trailing newline.
with open("/workspace/hftoken", 'r') as file:
    myhftoken = file.read().strip()

In [3]:
from datasets import load_dataset

In [4]:
# French variant of the dataset, currently disabled:
#dataset_name_fr = "frenchtext/banque-fr-2311"
#dataset_fr = load_dataset(dataset_name_fr, token=myhftoken)

# English dataset, loaded with the access token read above.
dataset_name_en = "frenchtext/bank-en-2401"
dataset_en = load_dataset(dataset_name_en, token=myhftoken)

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

In [5]:
dataset_en

DatasetDict({
    train: Dataset({
        features: ['Uri', 'ExtractedFromPDF', 'Timestamp', 'Lang', 'Title', 'Text', 'Words', 'AvgWordsLength', 'Chars', 'LetterChars', 'NumberChars', 'OtherChars'],
        num_rows: 20451
    })
    valid: Dataset({
        features: ['Uri', 'ExtractedFromPDF', 'Timestamp', 'Lang', 'Title', 'Text', 'Words', 'AvgWordsLength', 'Chars', 'LetterChars', 'NumberChars', 'OtherChars'],
        num_rows: 2555
    })
    test: Dataset({
        features: ['Uri', 'ExtractedFromPDF', 'Timestamp', 'Lang', 'Title', 'Text', 'Words', 'AvgWordsLength', 'Chars', 'LetterChars', 'NumberChars', 'OtherChars'],
        num_rows: 2579
    })
})

## Batching and tokenization

In [6]:
# Select which dataset and which split the rest of the notebook evaluates.
# `dataset_name` and `split` are also used to build the log messages and the CSV file name.
dataset_name = dataset_name_en
split = "valid"
dataset = dataset_en[split]

In [7]:
def get_dataset_batches(dataset, batch_size=32):
    """Yield consecutive slices of `dataset`, longest documents first.

    Only examples whose "Words" value is strictly greater than 15 are kept.
    The remaining examples are sorted by "Words" in descending order and then
    sliced into chunks of at most `batch_size` examples.

    Args:
        dataset: object exposing `filter`, `sort`, `len()` and slice indexing.
        batch_size: maximum number of examples per yielded slice.

    Yields:
        `sorted_dataset[start:end]` for each consecutive window.
    """
    def has_enough_words(example):
        return example["Words"] > 15

    ordered = dataset.filter(has_enough_words).sort("Words", reverse=True)
    total = len(ordered)

    for first in range(0, total, batch_size):
        # The last window may be shorter than batch_size
        last = min(first + batch_size, total)
        yield ordered[first:last]

In [8]:
def get_encoding_offsets(encoding):
    """Return the `(start, end)` character span covered by one encoding.

    `start` is the first component of the first entry of `encoding.offsets`,
    `end` is the second component of its last entry. An `end` of 0 is
    replaced by -1 (presumably because a trailing padding token reports a
    zero offset — TODO confirm).
    """
    offsets = encoding.offsets
    first_start = offsets[0][0]
    last_end = offsets[-1][1]
    return (first_start, last_end if last_end != 0 else -1)

In [9]:
def encode_dataset_batch(tokenizer, dataset_batch, stride=256):
    """Tokenize the "Text" column of a dataset batch into padded, truncated chunks.

    Texts that exceed the tokenizer's maximum length are split into several
    overflowing encodings that overlap by `stride` tokens. Two extra entries
    are attached to the returned encodings:

    - "overflow_to_sample_uri": for each encoding, the "Uri" of the sample it
      comes from (looked up through "overflow_to_sample_mapping");
    - "overflow_to_sample_offset": for each encoding, the span returned by
      `get_encoding_offsets`.

    Args:
        tokenizer: callable tokenizer supporting overflowing tokens.
        dataset_batch: mapping with at least the "Text" and "Uri" columns.
        stride: number of overlapping tokens between consecutive chunks.

    Returns:
        The tokenizer output, augmented with the two entries above.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        padding="longest",
        truncation=True,
        return_overflowing_tokens=True,
        stride=stride,
        # 2020: https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#tensor-core-shape
        # In 2023 this matters less and less: newer drivers and CUDA versions can use
        # tensor cores even without this aligned padding
        pad_to_multiple_of=16,
        return_tensors="pt",
    )
    encodings = tokenizer(text=dataset_batch["Text"], **tokenizer_options)

    # Each overflowing chunk points back to the index of its source sample in the batch
    encodings["overflow_to_sample_uri"] = [
        dataset_batch["Uri"][sample_id.item()]
        for sample_id in encodings["overflow_to_sample_mapping"]
    ]
    encodings["overflow_to_sample_offset"] = [
        get_encoding_offsets(chunk_encoding) for chunk_encoding in encodings.encodings
    ]

    return encodings

In [10]:
def get_encodings_batches(tokenizer, dataset, batch_size=32, stride=256):
    """Yield model-ready mini-batches of tokenized chunks for the whole dataset.

    The dataset is first cut into document batches (`get_dataset_batches`),
    each document batch is tokenized (`encode_dataset_batch`), and the
    resulting encodings — there can be more of them than documents because of
    overflowing chunks — are re-sliced into groups of at most `batch_size`.

    Yields:
        dict mapping every key of the encodings' `data` to the corresponding
        slice of values for the current group.
    """
    for documents in get_dataset_batches(dataset, batch_size):
        encoded = encode_dataset_batch(tokenizer, documents, stride)
        chunk_count = len(encoded.encodings)

        for first in range(0, chunk_count, batch_size):
            window = slice(first, min(first + batch_size, chunk_count))
            mini_batch = {}
            for field in encoded.data.keys():
                mini_batch[field] = encoded[field][window]
            yield mini_batch

## Load model

In [11]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Select the model to evaluate: the index below picks one key of the `models` dictionary
# (insertion order). Change the index to evaluate another model.
model_id = list(models)[0]
model_name = models[model_id]
print(f"Computing perplexity on dataset {dataset_name}:{split} for {model_name}")

# --- Tokenizer ---
# A few models need specific tokenizer options; all others use the defaults.
if model_id=="stablelm_3b":
    # The access token is passed for this repository
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=myhftoken)
elif model_id=="qwen_7b":
    # https://github.com/QwenLM/Qwen/blob/main/tokenization_note.md#special-tokens
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, pad_token = '<|endoftext|>')
else:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

# Padding is required for batching: fall back to the EOS token when no pad token is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- Model ---
# Per-model loading options (dtype, attention implementation, remote code, 8-bit loading).
# NOTE(review): model ids without a dedicated branch (including "qwen_14b" and the
# GPTQ/AWQ entries) fall through to the generic `else` branch.
if model_id=="tinyllama_1b":
    # torch_dtype="auto" loads the model in fp32, which is not compatible with flash attention
    model = AutoModelForCausalLM.from_pretrained(model_name, use_safetensors=True, device_map="auto", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
elif model_id=="btlm_3b":
    # no flash attention support as of 01/07/2024, using device_map triggers a fatal error
    model = AutoModelForCausalLM.from_pretrained(model_name, use_safetensors=True, torch_dtype="auto", attn_implementation="eager", trust_remote_code=True).to('cuda')
    # max context length supported without flash attention on a RTX 4090
    tokenizer.model_max_length = 4096
elif model_id=="stablelm_3b":
    model = AutoModelForCausalLM.from_pretrained(model_name, use_safetensors=True, device_map="auto", torch_dtype="auto", attn_implementation="flash_attention_2", trust_remote_code=True, token=myhftoken)
elif model_id=="phi2_3b" or model_id=="qwen_7b":
    # no flash attention support for phi2 as of 01/07/2024
    # for qwen: latest version of flash_attn installed, but module dropout_layer_norm not found
    model = AutoModelForCausalLM.from_pretrained(model_name, use_safetensors=True, device_map="auto", torch_dtype="auto", attn_implementation="eager", trust_remote_code=True)
elif model_id=="bloomz_7b" or model_id=="mpt_7b":
    # no flash attention support as of 01/08/2024
    model = AutoModelForCausalLM.from_pretrained(model_name, use_safetensors=True, device_map="auto", torch_dtype="auto", attn_implementation="eager")
elif model_id=="decilm_7b":
    model = AutoModelForCausalLM.from_pretrained(model_name, use_safetensors=True, device_map="auto", torch_dtype="auto", attn_implementation="flash_attention_2", trust_remote_code=True)
elif model_id=="openllama1_13b":
    # Chunking error during model conversion to safetensors
    # (hence no use_safetensors=True here); loaded in 8-bit
    model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto", torch_dtype="auto", attn_implementation="flash_attention_2")
elif model_id=="llama2_13b" or model_id=="solar_10b":
    # Loaded in 8-bit
    model = AutoModelForCausalLM.from_pretrained(model_name, use_safetensors=True, load_in_8bit=True, device_map="auto", torch_dtype="auto", attn_implementation="flash_attention_2")
else:
    # Default: safetensors weights, automatic device placement and dtype, flash attention 2
    model = AutoModelForCausalLM.from_pretrained(model_name, use_safetensors=True, device_map="auto", torch_dtype="auto", attn_implementation="flash_attention_2")
    
# --- Maximum sequence length ---
# The tokenizer's model_max_length drives truncation, so align it with the model configuration.
if model_id=="bloomz_7b":
    # This config exposes the context length as `seq_length`
    tokenizer.model_max_length = model.config.seq_length
elif model_id=="mpt_7b":
    # Keep the tokenizer's own model_max_length
    pass
else:
    # IMPORTANT fix: https://github.com/huggingface/transformers/issues/16186
    tokenizer.model_max_length = int(min(tokenizer.model_max_length, model.config.max_position_embeddings))

# Vocabulary size: `tokenizer.vocab` is not used for qwen_7b, `vocab_size` is reported instead
if model_id=="qwen_7b":
    print(f"- model vocabulary: {tokenizer.vocab_size}")
else:
    print(f"- model vocabulary: {len(tokenizer.vocab)}")

# Memory limit of RTX 4090
# Note: the decilm_7b cap below only applies when the length is not already above 8192 (elif)
if tokenizer.model_max_length>8192:
    tokenizer.model_max_length = 8192
elif model_id=="decilm_7b":
    tokenizer.model_max_length = 4096
print(f"- model sequence length: {int(tokenizer.model_max_length)}")

print(f"- model torch dtype: {model.dtype}")



Computing perplexity on dataset frenchtext/bank-en-2401:valid for TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T
- model vocabulary: 32000
- model sequence length: 2048
- model torch dtype: torch.float16


## Unigram-normalized perplexity

https://arxiv.org/pdf/2011.13220.pdf

Unigram-Normalized Perplexity as a Language Model Performance Measure with Different Vocabulary Sizes

*Jihyeon Roh, Sang-Hoon Oh, Soo-Young Lee*

Although Perplexity is a widely used performance metric for language models, the values are highly dependent upon the number of words in the corpus and is useful to compare performance of the same corpus only.

Perplexity may not be suitable for comparing LMs using different vocabularies because a larger vocabulary size tends to result in lower word probabilities and thus a higher Perplexity.

In this paper, we propose a new metric that can be used to evaluate language model performance with different vocabulary sizes. 

The proposed unigram-normalized Perplexity actually presents the performance improvement of the language models from that of simple unigram model, and is robust on the vocabulary size.

To overcome the limitations of the perplexity, we adopt the basic idea of normalizing the word probability with respect to a quantity containing the vocabulary size. 

We apply a unigram probability that is calculated from the word occurrence as a normalization factor for the perplexity. The unigram probability from the unigram LM is computed as Count(vk) / Count(all words), where Count(vk) is the number of occurrences of word vk in the corpus.

Our proposed metric is obtained by normalizing the perplexity with this unigram probability.

The proposed “Perplexity normalized with unigram” (PPLu) is defined as
PPLu = (Product for all words in sequence of : P(word | language model) / P(word | unigram))^(-1/length of sequence)

This metric shows the likelihood improvement of a context-dependent LM from unigram LM without the context information, and enables us to evaluate the effectiveness of an LM.

PPLu contains a unigram probability term, which allows PPLu to evaluate LMs more accurately than PPL does. Specifically, even if an LM fails to capture word relationships, it may achieve a good PPL by simply assigning high probabilities to words that frequently appear (e.g., unknown tokens). This case can be corrected with our PPLu, which considers the word frequencies via unigram probabilities.

Formula:

``` 
log(PPLu) = -1/length of sequence * Sum for all words in sequence( log(P(word | language model)) - log(P(word | unigram)) )
          = log(PPL) + 1/length of sequence * Sum for all words in sequence( log(P(word | unigram)) )
```

In [12]:
import torch.nn.functional as F

class PPLu():
    """Unigram-normalized perplexity (PPLu) of a causal language model.

    The constructor estimates the unigram probability of every token id from
    the dataset itself. Calling the instance on a batch of model outputs
    returns, per sequence, the cross-entropy of the model, the cross-entropy
    of the unigram model, and the derived perplexities:

        PPL  = exp(model_loss)
        PPLu = exp(model_loss - unigram_loss)
    """

    def __init__(self, dataset_iterator, tokenizer, device):
        """Count token occurrences over the dataset and derive unigram probabilities.

        Args:
            dataset_iterator: iterable of batches exposing a "Text" list of strings.
            tokenizer: tokenizer called on each batch of texts; its vocabulary size
                is read from `len(tokenizer.vocab)` when available, else from
                `tokenizer.vocab_size`.
            device: device on which the unigram probability table is stored.
        """
        if hasattr(tokenizer,"vocab"):
            self.vocab_size = len(tokenizer.vocab)
        else:
            self.vocab_size = tokenizer.vocab_size
        # One extra bucket (index == vocab_size) collects the padding positions
        dataset_token_id_counts = torch.zeros(self.vocab_size+1, dtype=torch.int64)
        dataset_tokens_count = 0

        for idx,dataset_batch in enumerate(dataset_iterator):
            encodings = tokenizer(text = dataset_batch["Text"], add_special_tokens=True, padding="longest", return_tensors="pt")

            # Padding tokens should be ignored: count them as token_id=vocabulary_size
            token_ids = encodings.input_ids*encodings.attention_mask + self.vocab_size*(1-encodings.attention_mask)

            token_id_counts = torch.bincount(token_ids.view(-1), minlength=self.vocab_size+1)
            tokens_count = encodings.attention_mask.sum()

            dataset_token_id_counts += token_id_counts
            dataset_tokens_count += tokens_count
            if idx%100==9: print(f"... {dataset_tokens_count:,} tokens")

        # Then discard the tokens count for token_id=vocabulary_size
        # Stored as a (vocab_size, 1) table so it can be looked up with F.embedding
        self.token_id_probs =  (dataset_token_id_counts[:-1] / dataset_tokens_count).unsqueeze(1).to(device)
        self.perplexity_loss = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction="none")
        print(f"Done: {dataset_tokens_count:,} tokens")

    def __call__(self, input_ids, attention_mask, output_logits):
        """Compute per-sequence losses and perplexities for one batch.

        Args:
            input_ids: (batch, seq) token ids fed to the model.
            attention_mask: (batch, seq) mask, 1 for real tokens and 0 for padding.
            output_logits: (batch, seq, vocab) logits returned by the model.

        Returns:
            Tuple of 1-D tensors with one entry per sequence:
            `(predicted_tokens_count, model_losses, unigram_losses,
            perplexities, unigram_normalized_perplexities)`.
            A sequence without any predicted token gets a loss of 0 instead of NaN;
            its `predicted_tokens_count` of 0 lets callers weight it out.
        """
        # Next-token prediction: shift prediction scores and input ids by one
        logits = output_logits[:, :-1, :].permute(0, 2, 1).contiguous()
        labels = input_ids[:, 1:].contiguous()
        # 1 where the label is a real token that must be scored, 0 where it is padding
        labels_mask = attention_mask[:, 1:]

        # Number of tokens predicted, ignoring padding tokens
        predicted_tokens_count = labels_mask.sum(dim=1)
        # Avoid 0 * inf = NaN for sequences without any predicted token
        tokens_divisor = predicted_tokens_count.clamp(min=1)

        # Cross entropy loss (ignore_index=-100)
        labels_for_crossentropy = labels*labels_mask -100*(1-labels_mask)
        batch_perplexity_losses = (1/tokens_divisor)*self.perplexity_loss(logits, labels_for_crossentropy).sum(1)

        # Unigram probability loss
        # Only drop the trailing embedding dimension: a bare squeeze() would also
        # collapse a batch or sequence dimension of size 1
        labels_probs = F.embedding(labels, self.token_id_probs).squeeze(-1)
        # prob = 1 for padding tokens => log prob = 0, ignored in the sum below
        labels_probs = labels_probs*labels_mask + (1-labels_mask)
        batch_unigram_losses = -(1/tokens_divisor)*torch.log(labels_probs).sum(dim=1)

        # Unigram-normalized perplexities
        perplexities = torch.exp(batch_perplexity_losses)
        unigram_normalized_perplexities = torch.exp(batch_perplexity_losses - batch_unigram_losses)

        return predicted_tokens_count, batch_perplexity_losses, batch_unigram_losses, perplexities, unigram_normalized_perplexities

In [13]:
class NormalizedPerplexityLogger:
    """Append per-encoding perplexity records to a text file in the working directory.

    The file name is derived from the dataset name, the split and the model
    name (with '/' replaced by '_') and ends with `_pplu.csv`. The file is
    truncated when the logger is created.

    The logger can be used as a context manager, or closed explicitly with
    `close()`. The file is line-buffered, so every record is handed to the
    operating system as soon as it is written, even if the logger is never closed.

    NOTE(review): fields are joined with bare commas without any quoting, so a
    URI containing a comma, or the tuple representation of `span`, adds extra
    commas to the line.
    """

    def __init__(self, dataset_name, split, model_name):
        """Create (or truncate) the log file for this dataset/split/model combination."""
        self.filename = f"{dataset_name.replace('/','_')}_{split}_{model_name.replace('/','_')}_pplu.csv"
        # buffering=1: line-buffered text file, no record stays in a user-space buffer
        self.file = open(self.filename, 'w', buffering=1)

    def log_batch(self, ppl, pplu, uri, span):
        """Write one record: perplexity, unigram-normalized perplexity, URI and span."""
        self.file.write(f"{ppl},{pplu},{uri},{span}\n")

    def close(self):
        """Close the underlying file; calling it more than once is harmless."""
        if not self.file.closed:
            self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never swallow exceptions raised inside the `with` block
        return False

In [None]:
# Build the PPLu metric: iterates once over the (filtered, sorted) dataset batches to
# count token occurrences, and stores the resulting table on the model's device.
pplu_loss = PPLu(get_dataset_batches(dataset), tokenizer, model.device)

In [None]:
# Batch size used for each model during evaluation.
# Models missing from this table previously left `batch_size` undefined (NameError,
# or a stale value from a previous run); they now fall back to a conservative default.
BATCH_SIZE_BY_MODEL = {
    "tinyllama_1b": 16, "redpajama_3b": 16, "openllama2_3b": 16,
    "redpajama_7b": 8,
    "stablelm_3b": 6, "phi2_3b": 6, "falcon_7b": 6, "mpt_7b": 6, "openllama1_13b": 6,
    "btlm_3b": 4, "llama2_7b": 4,
    "yi_6b": 3, "llama2_13b": 3,
    "bloomz_7b": 2, "llama2_7b_32k": 2, "mistral_7b": 2, "qwen_7b": 2, "decilm_7b": 2, "solar_10b": 2,
}
DEFAULT_BATCH_SIZE = 1

if model_id in BATCH_SIZE_BY_MODEL:
    batch_size = BATCH_SIZE_BY_MODEL[model_id]
else:
    batch_size = DEFAULT_BATCH_SIZE
    print(f"- no batch size configured for {model_id}: using batch_size={batch_size}")
# Number of overlapping tokens between consecutive chunks of a long document
stride = 256

print(f"- dataset examples: {len(dataset)}")
print(f"- batch_size={batch_size}, stride={stride}")

In [None]:
import math

# Per-encoding results are written to a file named after the dataset, split and model
logger = NormalizedPerplexityLogger(dataset_name, split, model_name)

def display_perplexities(pred_tokens_count, ppl_losses, unigram_losses):
    """Print the corpus-level perplexity and unigram-normalized perplexity.

    Per-sequence losses are averaged with a weight equal to the number of
    predicted tokens of each sequence, then exponentiated.

    Args:
        pred_tokens_count: list with the number of predicted tokens per sequence.
        ppl_losses: list with the model cross-entropy per sequence.
        unigram_losses: list with the unigram cross-entropy per sequence.
    """
    weights = torch.Tensor(pred_tokens_count)
    weights_total = weights.sum().item()

    def token_weighted_exp(losses):
        # exp of the token-weighted mean of the per-sequence losses
        return math.exp((losses * weights).sum().item() / weights_total)

    model_losses = torch.Tensor(ppl_losses)
    normalized_losses = model_losses - torch.Tensor(unigram_losses)

    ppl = token_weighted_exp(model_losses)
    pplu = token_weighted_exp(normalized_losses)

    print(f"-> perplexity = {ppl:.3f}")
    print(f"-> unigram-normalized perplexity = {pplu*1000:.3f} (x1000)")
    
# Per-sequence accumulators: one entry per encoding that went through the model
pred_tokens_count = []
ppl_losses = []
unigram_losses = []
try:
    for idx,encodings_batch in enumerate(get_encodings_batches(tokenizer, dataset, batch_size=batch_size, stride=stride)):
        with torch.no_grad():
            # predict next token
            inputs = encodings_batch["input_ids"].to(model.device)
            attention_mask = encodings_batch["attention_mask"].to(model.device)
            outputs = model(input_ids=inputs, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)

            batch_pred_tokens_count, batch_ppl_losses, batch_unigram_losses, batch_ppl, batch_pplu = pplu_loss(inputs, attention_mask, outputs.logits)

            pred_tokens_count.extend(batch_pred_tokens_count.tolist())
            ppl_losses.extend(batch_ppl_losses.tolist())
            unigram_losses.extend(batch_unigram_losses.tolist())

        # One log record per encoding, with the URI and character span of its source document
        for ppl,pplu,uri,span in zip(batch_ppl.tolist(), batch_pplu.tolist(), encodings_batch["overflow_to_sample_uri"], encodings_batch["overflow_to_sample_offset"]):
            logger.log_batch(ppl, pplu, uri, span)

        if idx%10 == 0:
            # Mini-batches can be smaller than batch_size, so report the real number of encodings
            print(f"{len(pred_tokens_count)} encodings processed")
            display_perplexities(pred_tokens_count, ppl_losses, unigram_losses)
finally:
    # Flush and release the log file even when the evaluation is interrupted
    logger.file.close()

print(f"FINAL RESULT: {len(pred_tokens_count)} encodings processed")
if pred_tokens_count:
    display_perplexities(pred_tokens_count, ppl_losses, unigram_losses)

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T
- model vocabulary: 32000
- model sequence length: 2048
- model torch dtype: torch.float16
- dataset examples: 8522
- batch_size= 16, stride=256
- perplexity = 6.196
- unigram-normalized perplexity = 7.739 (x1000)

7 min 52 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for togethercomputer/RedPajama-INCITE-Base-3B-v1
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
- model vocabulary: 50277
- model sequence length: 2048
- model torch dtype: torch.float16
- dataset examples: 8522
- batch_size=16, stride=256
- perplexity = 6.197
- unigram-normalized perplexity = 5.757 (x1000)

13 min 52 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for cerebras/btlm-3b-8k-base
- model vocabulary: 50257
- model sequence length: 4096 (8192 supported but too big for the RTX 4090)
- model torch dtype: torch.bfloat16
- dataset examples: 8522
- batch_size=4, stride=256
- perplexity = 7.333
- unigram-normalized perplexity = 9.866 (x1000)

42 min 56 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for openlm-research/open_llama_3b_v2
- model vocabulary: 32000
- model sequence length: 2048
- model torch dtype: torch.float16
- dataset examples: 8522
- batch_size=16, stride=256
- 16,584,523 tokens in 36 sec
- perplexity = 4.762
- unigram-normalized perplexity = 7.148 (x1000)

25 min 00 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for stabilityai/stablelm-3b-4e1t
- model vocabulary: 50277
- model sequence length: 4096
- model torch dtype: torch.bfloat16
- dataset examples: 8522
- batch_size=6, stride=256
- 14,248,418 tokens in 16 sec
- perplexity = 4.950
- unigram-normalized perplexity = 4.590 (x1000)

14 min 36 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for microsoft/phi-2
- model vocabulary: 50295
- model sequence length: 2048
- model torch dtype: torch.float16
- dataset examples: 8522
- batch_size=6, stride=256
- 16,122,587 tokens in 18 sec
- perplexity = 8.083
- unigram-normalized perplexity = 10.807 (x1000)

45 min 16 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for bigscience/bloomz-7b1-mt
- model vocabulary: 250680
- model sequence length: 2048
- model torch dtype: torch.float16
- dataset examples: 8522
- batch_size=2, stride=256
- 10,041,864 tokens in 6 sec
- perplexity = 10.038
- unigram-normalized perplexity = 4.591 (x1000)

58 min 52 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for tiiuae/falcon-7b
- model vocabulary: 65024
- model sequence length: 2048
- model torch dtype: torch.bfloat16
- dataset examples: 8522
- batch_size=6, stride=256
- 13,622,432 tokens in 11 sec
- perplexity = 4.335
- unigram-normalized perplexity = 4.660 (x1000)

29 min 43 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for togethercomputer/RedPajama-INCITE-7B-Base
- model vocabulary: 50277
- model sequence length: 2048
- model torch dtype: torch.float16
- dataset examples: 8522
- batch_size=8, stride=256
- 14,248,418 tokens in 13 sec
- perplexity = 5.512
- unigram-normalized perplexity = 5.120 (x1000)

29 min 31 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for mosaicml/mpt-7b
- model vocabulary: 50277
- model sequence length: 2048
- model torch dtype: torch.bfloat16
- dataset examples: 8522
- batch_size=6, stride=256
- 14,248,418 tokens in 8 sec
- perplexity = 5.581
- unigram-normalized perplexity = 5.184 (x1000)

51 min 52 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for meta-llama/Llama-2-7b-hf
- model vocabulary: 32000
- model sequence length: 4096
- model torch dtype: torch.float16
- dataset examples: 8522
- batch_size=4, stride=256
- 15,042,809 tokens in 47 sec
- perplexity = 4.236
- unigram-normalized perplexity = 5.289 (x1000)

36 min 24 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for togethercomputer/LLaMA-2-7B-32K
- model vocabulary: 32000
- model sequence length: 8192 (32768 supported but too large)
- model torch dtype: torch.float16
- dataset examples: 8522
- batch_size=2, stride=256
- 15,034,641 tokens in 43 sec
- perplexity = 4.409
- unigram-normalized perplexity = 5.504 (x1000)

36 min 29 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for mistralai/Mistral-7B-v0.1
- model vocabulary: 32000
- model sequence length: 8192 (32768 supported but too large)
- model torch dtype: torch.bfloat16
- dataset examples: 8522
- batch_size=2, stride=256
- 15,453,930 tokens in 39 sec
- perplexity = 3.803
- unigram-normalized perplexity = 4.955 (x1000)

40 min 27 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for Qwen/Qwen-7B
- model vocabulary: 151851
- model sequence length: 32768
- model torch dtype: torch.bfloat16
- dataset examples: 8522
- batch_size=2, stride=256
- 13,057,768 tokens in 11 sec

ERROR - Could not resolve the error in tiktoken fast tokenizer:
Unable to create tensor returning overflowing tokens of different lengths. Please see if a fast version of this tokenizer is available to have this feature available.

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for 01-ai/Yi-6B
- model vocabulary: 64000
- model sequence length: 4096
- model torch dtype: torch.bfloat16
- dataset examples: 8522
- batch_size=3, stride=256
- 16,978,029 tokens in 38 sec
- perplexity = 4.108
- unigram-normalized perplexity = 6.828 (x1000)

37 min 2 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for Deci/DeciLM-7B
- model vocabulary: 32000
- model sequence length: 4096
- model torch dtype: torch.bfloat16
- dataset examples: 8522
- batch_size=2, stride=256
- 15,453,930 tokens in 34 sec
- perplexity = 5.827
- unigram-normalized perplexity = 7.595 (x1000)

46 min 25 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for openlm-research/open_llama_13b
- **load_in_8bit=True**
- model vocabulary: 32000
- model sequence length: 2048
- model torch dtype: torch.float16
- dataset examples: 8522
- batch_size=6, stride=256
- 16,739,789 tokens in 35 sec
- perplexity = 4.236
- unigram-normalized perplexity = 6.540 (x1000)

1 h 4 min

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for meta-llama/Llama-2-13b-hf
- **load_in_8bit=True**
- model vocabulary: 32000
- model sequence length: 4096
- model torch dtype: torch.float16
- dataset examples: 8522
- batch_size=3, stride=256
- 15,042,809 tokens in 35 sec
- perplexity = 3.923
- unigram-normalized perplexity = 4.898 (x1000)

59 min 19 sec

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for upstage/SOLAR-10.7B-v1.0
- **load_in_8bit=True**
- model vocabulary: 32000
- model sequence length: 4096
- model torch dtype: torch.float16
- dataset examples: 8522
- batch_size=2, stride=256
- 15,453,930 tokens in 36 sec
- perplexity = 4.056
- unigram-normalized perplexity = 5.286 (x1000)

1h 29 min

Computing perplexity on dataset frenchtext/bank-en-2401:valid for TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T
- model vocabulary: 32000
- model sequence length: 2048
- model torch dtype: torch.float16

