# Most popular open weights LLMs - December 2023

## Dependencies

In [None]:
pip install datasets

## Models dictionary

In [1]:
models = { 
    "redpajama_3b" : "togethercomputer/RedPajama-INCITE-Base-3B-v1", # 5.30 GB
    "btlm_3b" : "cerebras/btlm-3b-8k-base", #  4.93 GB
    "openllama2_3b" : "openlm-research/open_llama_3b_v2", #  6.38 GB
    "stablelm_3b" : "stabilityai/stablelm-3b-4e1t", # 5.21 GB
    "phi2_3b" : "microsoft/phi-2", # 5.18 GB

    "bloomz_7b" : "bigscience/bloomz-7b1-mt", # 13.18 GB
    "falcon_7b" : "tiiuae/falcon-7b", # 13.45 GB       
    "redpajama_7b" : "togethercomputer/RedPajama-INCITE-7B-Base", # 12.90 GB
    "mpt_7b" : "mosaicml/mpt-7b", # 12.39 GB
    "mpt_7b_8k" : "mosaicml/mpt-7b-8k", # 12.39 GB
    "openllama2_7b" : "openlm-research/open_llama_7b_v2", # 12.55 GB
    "llama2_7b" : "meta-llama/Llama-2-7b-hf", # 12.55 GB
    "llama2_7b_32k" : "togethercomputer/LLaMA-2-7B-32K", # 12.55 GB
    "mistral_7b" : "mistralai/Mistral-7B-v0.1", # 13.49 GB
    "qwen_7b" : "Qwen/Qwen-7B", # 14.38 GB
    "yi_6b" : "01-ai/Yi-6B", # 11.29 GB
    "decilm_7b" : "Deci/DeciLM-7B", # 13.12 GB
    
    "openllama1_13b" : "openlm-research/open_llama_13b", # 24.24 GB
    "llama2_13b" : "meta-llama/Llama-2-13b-hf", # 24.25 GB
    "qwen_14b" : "Qwen/Qwen-14B", # 26.39 GB
    "solar_10b" : "upstage/SOLAR-10.7B-v1.0", # 19.99 GB
    
    "llama1_33b" : "TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ", # 15.78 GB https://huggingface.co/alexl83/LLaMA-33B-HF
    "falcon_40b" : "TheBloke/falcon-40b-instruct-GPTQ", # 21.00 GB https://huggingface.co/tiiuae/falcon-40b
    "mpt_30b" : "abhinavkulkarni/mosaicml-mpt-30b-instruct-w4-g128-awq", # 15.00 GB https://huggingface.co/mosaicml/mpt-30b
    "codellama_34b" : "TheBloke/CodeLlama-34B-Instruct-GPTQ", # 17.07 GB https://huggingface.co/codellama/CodeLlama-34b-hf
    "yi_34b" : "TheBloke/Yi-34B-GPTQ", # 17.33 GB https://huggingface.co/01-ai/Yi-34B    
    "mixtral_8x7B" : "TheBloke/Mixtral-8x7B-v0.1-GPTQ" # 22.18 GB https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
}

## Datasets

In [2]:
from datasets import load_dataset

In [3]:
dataset_name_fr = "frenchtext/banque-fr-2311"
dataset_fr = load_dataset(dataset_name_fr)

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

In [4]:
dataset_fr

DatasetDict({
    train: Dataset({
        features: ['Uri', 'Timestamp', 'Lang', 'Title', 'Text', 'Words', 'AvgWordsLength', 'Chars', 'LetterChars', 'NumberChars', 'OtherChars', 'Website', 'PDF'],
        num_rows: 68166
    })
    valid: Dataset({
        features: ['Uri', 'Timestamp', 'Lang', 'Title', 'Text', 'Words', 'AvgWordsLength', 'Chars', 'LetterChars', 'NumberChars', 'OtherChars', 'Website', 'PDF'],
        num_rows: 8522
    })
    test: Dataset({
        features: ['Uri', 'Timestamp', 'Lang', 'Title', 'Text', 'Words', 'AvgWordsLength', 'Chars', 'LetterChars', 'NumberChars', 'OtherChars', 'Website', 'PDF'],
        num_rows: 8541
    })
})

## Batching and tokenization

In [5]:
dataset_name = dataset_name_fr
split = "valid"
dataset = dataset_fr[split]

In [6]:
def get_dataset_batches(dataset, batch_size=32):
    filtered_dataset = dataset.filter(lambda example: example["Words"]>10)
    sorted_dataset = dataset.sort("Words",reverse=True)
    
    dataset_length = len(sorted_dataset)
    for start_idx in range(0, dataset_length, batch_size):
        end_idx = min(start_idx + batch_size, dataset_length)
        yield sorted_dataset[start_idx:end_idx]

In [7]:
def get_encoding_offsets(encoding):
    start_index = encoding.offsets[0][0]
    end_index = encoding.offsets[-1][1]
    if end_index==0: end_index = -1
    return (start_index, end_index)

In [8]:
def encode_dataset_batch(tokenizer, dataset_batch, stride=256):
    encodings = tokenizer(text = dataset_batch["Text"], add_special_tokens=True, 
                      padding="longest", truncation=True, return_overflowing_tokens=True, stride=stride,
                      # 2020: https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#tensor-core-shape
                      # However now in 2023, this is less and less true, newer drivers and cuda versions are smarter about this and will be able to use tensorcores even without this aligned padding
                      pad_to_multiple_of=16, return_tensors="pt")

    encodings["overflow_to_sample_uri"] = list(map(lambda sample_id: dataset_batch["Uri"][sample_id.item()], encodings["overflow_to_sample_mapping"]))
    encodings["overflow_to_sample_offset"] = list(map(get_encoding_offsets, encodings.encodings))
    
    return encodings

In [9]:
def get_encodings_batches(tokenizer, dataset, batch_size=32, stride=256):
    for dataset_batch in get_dataset_batches(dataset, batch_size):
        encodings = encode_dataset_batch(tokenizer, dataset_batch, stride)
        
        encodings_length = len(encodings.encodings)
        for start_idx in range(0, encodings_length, batch_size):
            end_idx = min(start_idx + batch_size, encodings_length)
            yield {key: encodings[key][start_idx:end_idx] for key in encodings.data.keys()}

## Compute perplexity

In [10]:
with open("/workspace/hftoken", 'r') as file:
    myhftoken = file.read().strip()

In [11]:
class PerplexityLogger:
    def __init__(self, dataset_name, split, model_name):
        self.filename = f"{dataset_name.replace('/','_')}_{split}_{model_name.replace('/','_')}_perplexity.csv"
        self.file = open(self.filename, 'w')
        
    def log_batch(self, perplexity, uri, span):
        self.file.write(f"{perplexity},{uri},{span}\n")

In [12]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

batch_size = 8
stride = 256

model_id = list(models)[6]
model_name = models[model_id]
print(f"Computing perplexity on dataset {dataset_name}:{split} for {model_name}")
print(f"- dataset examples: {len(dataset)}")
print(f"- batch_size= {batch_size}, stride={stride}")

tokenizer = AutoTokenizer.from_pretrained(model_name)#, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto", attn_implementation="flash_attention_2")#, trust_remote_code=True, token=myhftoken) 
print(f"- model torch dtype: {model.dtype}")
print(f"- model vocabulary: {len(tokenizer.vocab)}")
print(f"- model sequence length: {int(tokenizer.model_max_length)}")



Computing perplexity on dataset frenchtext/banque-fr-2311:valid for tiiuae/falcon-7b
- dataset examples: 8522
- batch_size= 8, stride=256


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

- model torch dtype: torch.bfloat16
- model vocabulary: 65024
- model sequence length: 2048


In [None]:
logger = PerplexityLogger(dataset_name, split, model_name)
loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
losses = []    
for idx,encodings_batch in enumerate(get_encodings_batches(tokenizer, dataset, batch_size=batch_size, stride=stride)):
    with torch.no_grad():
        # predict next token
        inputs = encodings_batch["input_ids"].to(model.device)
        attention_mask = encodings_batch["attention_mask"].to(model.device)
        outputs = model(input_ids=inputs, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)

        # compute perplexity
        # we are doing next-token prediction; shift prediction µscores and input ids by one
        shift_logits = outputs.logits[:, :-1, :].permute(0, 2, 1).contiguous()
        labels = inputs[:, 1:].contiguous()
        labels_to_ignore = attention_mask[:, 1:]
        # CrossEntropyLoss: ignore_index=-100
        labels = labels*labels_to_ignore -100*(1-labels_to_ignore)
        batch_losses = loss_fct(shift_logits, labels).mean(1)
        losses.extend(batch_losses)
        batch_perplexities = torch.exp(batch_losses).tolist()

    for perplexity,uri,span in zip(batch_perplexities, encodings_batch["overflow_to_sample_uri"], encodings_batch["overflow_to_sample_offset"]):
        logger.log_batch(perplexity, uri, span)

    if idx%10 == 0:
        perplexity = torch.exp(torch.stack(losses).mean().float()).item()
        print(f"{(idx+1)*batch_size} encodings processed -> perplexity = {perplexity}")

perplexity = torch.exp(torch.stack(losses).mean().float()).item()
print(f"-> perplexity = {perplexity}")

Computing perplexity on dataset frenchtext/banque-fr-2311 for togethercomputer/RedPajama-INCITE-Base-3B-v1
- dataset examples: 68166
- batch_size= 16, stride=256
- model torch dtype: torch.float16
- model vocabulary: 50277
- model sequence length: 2048
- perplexity = 5.301388263702393 (train)
- perplexity = 5.480365753173828 (valid) [+3,4%]

Computing perplexity on dataset frenchtext/banque-fr-2311 for openlm-research/open_llama_3b_v2
- dataset examples: 68166
- batch_size= 12, stride=256
- model torch dtype: torch.float16
- model vocabulary: 32000
- model sequence length: 2048
- perplexity = 4.064583778381348 (train)
- perplexity = 3.9680004119873047 (valid) [-2,3%]

Computing perplexity on dataset frenchtext/banque-fr-2311 for togethercomputer/RedPajama-INCITE-7B-Base
- dataset examples: 68166
- batch_size= 8, stride=256
- model torch dtype: torch.float16
- model vocabulary: 50277
- model sequence length: 2048
- perplexity = 4.955935478210449

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for mistralai/Mistral-7B-v0.1
- dataset examples: 8522
- batch_size= 4, stride=256
- model torch dtype: torch.bfloat16
- model vocabulary: 32000
- model sequence length: 4096
- perplexity = 3.9531056880950928 (valid)

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for 01-ai/Yi-6B
- dataset examples: 8522
- batch_size= 4, stride=256
- model torch dtype: torch.bfloat16
- model vocabulary: 64000
- model sequence length: 4096
- perplexity = 3.990814685821533 (valid)

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for tiiuae/falcon-7b
- dataset examples: 8522
- batch_size= 8, stride=256
- model torch dtype: torch.bfloat16
- model vocabulary: 65024
- model sequence length: 2048
- perplexity = 3.8035600185394287 (valid)

## Unigram-normalized perplexity

https://arxiv.org/pdf/2011.13220.pdf

Unigram-Normalized Perplexity as a Language Model Performance Measure with Different Vocabulary Sizes

*Jihyeon Roh, Sang-Hoon Oh, Soo-Young Lee*

Although Perplexity is a widely used performance metric for language models, the values are highly dependent upon the number of words in the corpus and is useful to compare performance of the same corpus only.

Perplexity may not be suitable for comparing LMs using different vocabularies because a larger vocabulary size tends to result in lower word probabilities and thus a higher Perplexity.

In this paper, we propose a new metric that can be used to evaluate language model performance with different vocabulary sizes. 

The proposed unigram-normalized Perplexity actually presents the performance improvement of the language models from that of simple unigram model, and is robust on the vocabulary size.

To overcome the limitations of the perplexity, we adopt the basic idea of normalizing the word probability with respect to a quantity containing the vocabulary size. 

We apply a unigram probability that is calculated from the word occurrence as a normalization factor for the perplexity. The unigram probability from the unigram LM is computed as Count(vk) / Count(all words), where Count(vk) is the number of occurrences of word vk in the corpus.

Our proposed metric is obtained by normalizing the perplexity with this unigram probability.

The proposed “Perplexity normalized with unigram” (PPLu) is defined as
PPLu = (Product for all words in sequence of : P(word | language model) / P(word | unigram))^1/length of sequence 

This metric shows the likelihood improvement of a context-dependent LM from unigram LM without the context information, and enables us to evaluate the effectiveness of an LM.

PPLu contains a unigram probability term, which allows PPLu to evaluate LMs more accurately than PPL does. Specifically, even if an LM fails to capture word relationships, it may achieve a good PPL by simply assigning high probabilities to words that frequently appear (e.g., unknown tokens). This case can be corrected with our PPLu, which considers the word frequencies via unigram probabilities.

Formula:

``` 
log(PPLu) = 1/length of sequence * Sum for all words in sequence( log(P(word | language model)) - log(P(word | unigram)))
          = Log(PPL) - 1/length of sequence * Sum for all words in sequence( log(P(word | unigram) )
```

In [16]:
encodings_batch = next(get_encodings_batches(tokenizer, dataset, batch_size=batch_size, stride=stride))
encodings_batch.keys()

dict_keys(['input_ids', 'attention_mask', 'overflow_to_sample_mapping', 'overflow_to_sample_uri', 'overflow_to_sample_offset'])

In [22]:
encodings_batch["input_ids"].size(),encodings_batch["input_ids"][0]

(torch.Size([8, 2048]),
 tensor([   14,   382,  2992,  ...,  8409, 34268,  2263]))

In [17]:
loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
with torch.no_grad():
    # predict next token
    inputs = encodings_batch["input_ids"].to(model.device)
    attention_mask = encodings_batch["attention_mask"].to(model.device)
    outputs = model(input_ids=inputs, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)

outputs.logits.size()

torch.Size([8, 2048, 65024])

In [18]:
# compute perplexity
# we are doing next-token prediction; shift prediction scores and input ids by one
shift_logits = outputs.logits[:, :-1, :].permute(0, 2, 1).contiguous()
labels = inputs[:, 1:].contiguous()
labels_to_ignore = attention_mask[:, 1:]
# CrossEntropyLoss: ignore_index=-100
labels = labels*labels_to_ignore -100*(1-labels_to_ignore)
batch_losses = loss_fct(shift_logits, labels).mean(1)

batch_losses

tensor([1.5547, 1.7812, 1.6875, 1.6875, 1.6094, 1.4062, 1.4375, 1.6016],
       device='cuda:0', dtype=torch.bfloat16)

In [19]:
batch_perplexities = torch.exp(batch_losses).tolist()

batch_perplexities

[4.71875, 5.9375, 5.40625, 5.40625, 5.0, 4.09375, 4.21875, 4.96875]

In [26]:
# Compute unigram probabilities

dataset_batch = next(get_dataset_batches(dataset))

In [54]:
tokenizer.all_special_tokens

['<|endoftext|>',
 '>>TITLE<<',
 '>>ABSTRACT<<',
 '>>INTRODUCTION<<',
 '>>SUMMARY<<',
 '>>COMMENT<<',
 '>>ANSWER<<',
 '>>QUESTION<<',
 '>>DOMAIN<<',
 '>>PREFIX<<',
 '>>SUFFIX<<',
 '>>MIDDLE<<']

In [77]:
token_to_ignore = 0
tokenizer.pad_token = tokenizer.decode(token_to_ignore)
tokenizer.pad_token

'>>TITLE<<'

In [68]:
encodings = tokenizer(text = dataset_batch["Text"], add_special_tokens=True, padding="longest", return_tensors="pt")

In [69]:
token_ids = encodings.input_ids
token_ids.size(),token_ids

(torch.Size([32, 288564]),
 tensor([[   14,   382,  2992,  ...,   193,   195,   193],
         [   14, 12869, 18413,  ...,     0,     0,     0],
         [   14,  6419,  2454,  ...,     0,     0,     0],
         ...,
         [   57,    25,    52,  ...,     0,     0,     0],
         [34466, 18587,  8158,  ...,     0,     0,     0],
         [   14,  9449, 53037,  ...,     0,     0,     0]]))

In [75]:
tokens_count = encodings.attention_mask.sum()
tokens_count

tensor(2503395)

In [70]:
flattened_token_ids = token_ids.view(-1)
flattened_token_ids.size(),flattened_token_ids

(torch.Size([9234048]), tensor([  14,  382, 2992,  ...,    0,    0,    0]))

In [71]:
vocab_size = len(tokenizer.vocab)
token_counts = torch.zeros(vocab_size, dtype=torch.int64)

In [72]:
# Count the occurrences of each token ID in this batch
batch_counts = torch.bincount(flattened_token_ids, minlength=vocab_size)

# Add the counts from this batch to the total counts
token_counts += batch_counts

In [73]:
token_counts[:100]

tensor([6730653,       0,       0,       0,       0,       0,       0,       0,
              0,       0,       0,       0,      78,    3592,     161,      22,
           5528,     316,   13417,   17977,   13947,     246,     566,   54753,
          21514,   51442,    6787,    6689,    7851,    5767,    4073,    2831,
           2294,    2014,    1850,    2212,    1734,    6714,    4342,     157,
             81,     203,     372,      57,    1704,    1272,    1656,    1644,
           1203,    1402,     929,     378,    1150,     524,     175,    1353,
            905,     866,    2015,    1588,      67,    2018,    1989,    1237,
            578,     825,      48,     133,     143,      92,    1505,      69,
           1184,       4,      73,      37,    2013,    1130,    1607,    5055,
           4446,    1655,     722,     333,    2166,     201,      70,    3124,
           1909,    2127,     282,    1153,     101,    1207,    4233,    1928,
           2679,    1544,      12,     2

In [94]:
token_counts[token_to_ignore] = 0
vocab_probs = (token_counts / tokens_count).unsqueeze(1)

In [96]:
vocab_probs[:20]

tensor([[0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [3.1158e-05],
        [1.4349e-03],
        [6.4313e-05],
        [8.7881e-06],
        [2.2082e-03],
        [1.2623e-04],
        [5.3595e-03],
        [7.1810e-03]])

In [97]:
test_input = token_ids[:,:100]
test_input.size(),test_input

(torch.Size([32, 100]),
 tensor([[   14,   382,  2992,  ...,   204, 15244, 60445],
         [   14, 12869, 18413,  ...,   193,  2291, 19140],
         [   14,  6419,  2454,  ...,  2937,   204,    13],
         ...,
         [   57,    25,    52,  ...,   204, 50801,   204],
         [34466, 18587,  8158,  ...,   195,   193,   195],
         [   14,  9449, 53037,  ..., 20643, 28889, 32087]]))

In [100]:
import torch.nn.functional as F

probs = F.embedding(test_input, vocab_probs).squeeze()
probs.size(),probs

(torch.Size([32, 100]),
 tensor([[6.4313e-05, 8.8040e-04, 7.7095e-05,  ..., 3.6254e-02, 2.7563e-05,
          1.5978e-06],
         [6.4313e-05, 5.5924e-06, 1.1984e-05,  ..., 8.6842e-02, 1.1676e-03,
          4.7456e-04],
         [6.4313e-05, 1.2783e-05, 2.5685e-04,  ..., 5.1798e-03, 3.6254e-02,
          1.4349e-03],
         ...,
         [3.4593e-04, 2.0549e-02, 4.5938e-04,  ..., 3.6254e-02, 4.3940e-06,
          3.6254e-02],
         [2.7962e-06, 5.1929e-06, 3.2356e-05,  ..., 3.6540e-02, 8.6842e-02,
          3.6540e-02],
         [6.4313e-05, 9.7468e-05, 2.3169e-05,  ..., 4.1384e-04, 3.6470e-04,
          3.5951e-04]]))

In [102]:
torch.log(probs).sum(dim=1)

tensor([-726.7230, -726.9221, -662.6291, -767.7623, -720.5561, -628.8273,
        -644.9472, -684.7527, -708.9458, -712.5073, -633.9623, -659.5041,
        -706.5404, -651.2568, -740.7549, -722.7048, -685.3760, -647.0948,
        -727.0426, -686.0406, -757.7579, -727.1324, -686.3970, -659.1260,
        -629.7627, -737.1560, -671.4485, -735.9675, -663.1301, -716.1690,
        -675.1099, -727.1324])