# Most popular open weights LLMs - December 2023

## Dependencies

In [None]:
pip install datasets

## Models dictionary

In [1]:
models = { 
    "redpajama_3b" : "togethercomputer/RedPajama-INCITE-Base-3B-v1", # 5.30 GB
    "btlm_3b" : "cerebras/btlm-3b-8k-base", #  4.93 GB
    "openllama2_3b" : "openlm-research/open_llama_3b_v2", #  6.38 GB
    "stablelm_3b" : "stabilityai/stablelm-3b-4e1t", # 5.21 GB
    "phi2_3b" : "microsoft/phi-2", # 5.18 GB

    "bloomz_7b" : "bigscience/bloomz-7b1-mt", # 13.18 GB
    "falcon_7b" : "tiiuae/falcon-7b", # 13.45 GB       
    "redpajama_7b" : "togethercomputer/RedPajama-INCITE-7B-Base", # 12.90 GB
    "mpt_7b" : "mosaicml/mpt-7b", # 12.39 GB
    "mpt_7b_8k" : "mosaicml/mpt-7b-8k", # 12.39 GB
    "openllama2_7b" : "openlm-research/open_llama_7b_v2", # 12.55 GB
    "llama2_7b" : "meta-llama/Llama-2-7b-hf", # 12.55 GB
    "llama2_7b_32k" : "togethercomputer/LLaMA-2-7B-32K", # 12.55 GB
    "mistral_7b" : "mistralai/Mistral-7B-v0.1", # 13.49 GB
    "qwen_7b" : "Qwen/Qwen-7B", # 14.38 GB
    "yi_6b" : "01-ai/Yi-6B", # 11.29 GB
    "decilm_7b" : "Deci/DeciLM-7B", # 13.12 GB
    
    "openllama1_13b" : "openlm-research/open_llama_13b", # 24.24 GB
    "llama2_13b" : "meta-llama/Llama-2-13b-hf", # 24.25 GB
    "qwen_14b" : "Qwen/Qwen-14B", # 26.39 GB
    "solar_10b" : "upstage/SOLAR-10.7B-v1.0", # 19.99 GB
    
    "llama1_33b" : "TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ", # 15.78 GB https://huggingface.co/alexl83/LLaMA-33B-HF
    "falcon_40b" : "TheBloke/falcon-40b-instruct-GPTQ", # 21.00 GB https://huggingface.co/tiiuae/falcon-40b
    "mpt_30b" : "abhinavkulkarni/mosaicml-mpt-30b-instruct-w4-g128-awq", # 15.00 GB https://huggingface.co/mosaicml/mpt-30b
    "codellama_34b" : "TheBloke/CodeLlama-34B-Instruct-GPTQ", # 17.07 GB https://huggingface.co/codellama/CodeLlama-34b-hf
    "yi_34b" : "TheBloke/Yi-34B-GPTQ", # 17.33 GB https://huggingface.co/01-ai/Yi-34B    
    "mixtral_8x7B" : "TheBloke/Mixtral-8x7B-v0.1-GPTQ" # 22.18 GB https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
}

## Datasets

In [2]:
from datasets import load_dataset

In [3]:
dataset_name_fr = "frenchtext/banque-fr-2311"
dataset_fr = load_dataset(dataset_name_fr)

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

In [4]:
dataset_fr

DatasetDict({
    train: Dataset({
        features: ['Uri', 'Timestamp', 'Lang', 'Title', 'Text', 'Words', 'AvgWordsLength', 'Chars', 'LetterChars', 'NumberChars', 'OtherChars', 'Website', 'PDF'],
        num_rows: 68166
    })
    valid: Dataset({
        features: ['Uri', 'Timestamp', 'Lang', 'Title', 'Text', 'Words', 'AvgWordsLength', 'Chars', 'LetterChars', 'NumberChars', 'OtherChars', 'Website', 'PDF'],
        num_rows: 8522
    })
    test: Dataset({
        features: ['Uri', 'Timestamp', 'Lang', 'Title', 'Text', 'Words', 'AvgWordsLength', 'Chars', 'LetterChars', 'NumberChars', 'OtherChars', 'Website', 'PDF'],
        num_rows: 8541
    })
})

## Batching and tokenization

In [5]:
dataset_name = dataset_name_fr
split = "valid"
dataset = dataset_fr[split]

In [6]:
def get_dataset_batches(dataset, batch_size=32):
    filtered_dataset = dataset.filter(lambda example: example["Words"]>10)
    sorted_dataset = dataset.sort("Words",reverse=True)
    
    dataset_length = len(sorted_dataset)
    for start_idx in range(0, dataset_length, batch_size):
        end_idx = min(start_idx + batch_size, dataset_length)
        yield sorted_dataset[start_idx:end_idx]

In [7]:
def get_encoding_offsets(encoding):
    start_index = encoding.offsets[0][0]
    end_index = encoding.offsets[-1][1]
    if end_index==0: end_index = -1
    return (start_index, end_index)

In [8]:
def encode_dataset_batch(tokenizer, dataset_batch, stride=256):
    encodings = tokenizer(text = dataset_batch["Text"], add_special_tokens=True, 
                      padding="longest", truncation=True, return_overflowing_tokens=True, stride=stride,
                      # 2020: https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#tensor-core-shape
                      # However now in 2023, this is less and less true, newer drivers and cuda versions are smarter about this and will be able to use tensorcores even without this aligned padding
                      pad_to_multiple_of=16, return_tensors="pt")

    encodings["overflow_to_sample_uri"] = list(map(lambda sample_id: dataset_batch["Uri"][sample_id.item()], encodings["overflow_to_sample_mapping"]))
    encodings["overflow_to_sample_offset"] = list(map(get_encoding_offsets, encodings.encodings))
    
    return encodings

In [9]:
def get_encodings_batches(tokenizer, dataset, batch_size=32, stride=256):
    for dataset_batch in get_dataset_batches(dataset, batch_size):
        encodings = encode_dataset_batch(tokenizer, dataset_batch, stride)
        
        encodings_length = len(encodings.encodings)
        for start_idx in range(0, encodings_length, batch_size):
            end_idx = min(start_idx + batch_size, encodings_length)
            yield {key: encodings[key][start_idx:end_idx] for key in encodings.data.keys()}

## Compute perplexity

In [10]:
with open("/workspace/hftoken", 'r') as file:
    myhftoken = file.read().strip()

In [11]:
class PerplexityLogger:
    def __init__(self, dataset_name, split, model_name):
        self.filename = f"{dataset_name.replace('/','_')}_{split}_{model_name.replace('/','_')}_perplexity.csv"
        self.file = open(self.filename, 'w')
        
    def log_batch(self, perplexity, uri, span):
        self.file.write(f"{perplexity},{uri},{span}\n")

In [12]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

batch_size = 4
stride = 256

model_id = list(models)[16]
model_name = models[model_id]
print(f"Computing perplexity on dataset {dataset_name}:{split} for {model_name}")
print(f"- dataset examples: {len(dataset)}")
print(f"- batch_size= {batch_size}, stride={stride}")

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto", attn_implementation="flash_attention_2", trust_remote_code=True, token=myhftoken) 
print(f"- model torch dtype: {model.dtype}")
print(f"- model vocabulary: {len(tokenizer.vocab)}")
print(f"- model sequence length: {int(tokenizer.model_max_length)}")



Computing perplexity on dataset frenchtext/banque-fr-2311:valid for Deci/DeciLM-7B
- dataset examples: 8522
- batch_size= 4, stride=256


A new version of the following files was downloaded from https://huggingface.co/Deci/DeciLM-7B:
- configuration_decilm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Deci/DeciLM-7B:
- modeling_decilm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

- model torch dtype: torch.bfloat16
- model vocabulary: 32000
- model sequence length: 1000000000000000019884624838656


In [13]:
tokenizer.model_max_length = 8192
batch_size=1

In [None]:
logger = PerplexityLogger(dataset_name, split, model_name)
loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
losses = []    
for idx,encodings_batch in enumerate(get_encodings_batches(tokenizer, dataset, batch_size=batch_size, stride=stride)):
    with torch.no_grad():
        # predict next token
        inputs = encodings_batch["input_ids"].to(model.device)
        attention_mask = encodings_batch["attention_mask"].to(model.device)
        outputs = model(input_ids=inputs, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)

        # compute perplexity
        # we are doing next-token prediction; shift prediction scores and input ids by one
        shift_logits = outputs.logits[:, :-1, :].permute(0, 2, 1).contiguous()
        labels = inputs[:, 1:].contiguous()
        labels_to_ignore = attention_mask[:, 1:]
        # CrossEntropyLoss: ignore_index=-100
        labels = labels*labels_to_ignore -100*(1-labels_to_ignore)
        batch_losses = loss_fct(shift_logits, labels).mean(1)
        losses.extend(batch_losses)
        batch_perplexities = torch.exp(batch_losses).tolist()

    for perplexity,uri,span in zip(batch_perplexities, encodings_batch["overflow_to_sample_uri"], encodings_batch["overflow_to_sample_offset"]):
        logger.log_batch(perplexity, uri, span)

    if idx%10 == 0:
        perplexity = torch.exp(torch.stack(losses).mean().float()).item()
        print(f"{(idx+1)*batch_size} encodings processed -> perplexity = {perplexity}")

perplexity = torch.exp(torch.stack(losses).mean().float()).item()
print(f"-> perplexity = {perplexity}")

1 encodings processed -> perplexity = 5.994564056396484
11 encodings processed -> perplexity = 6.34369421005249
21 encodings processed -> perplexity = 6.025722980499268
31 encodings processed -> perplexity = 6.182291030883789
41 encodings processed -> perplexity = 5.930436134338379
51 encodings processed -> perplexity = 5.753510475158691
61 encodings processed -> perplexity = 5.713020324707031
71 encodings processed -> perplexity = 5.6975531578063965
81 encodings processed -> perplexity = 5.545077800750732
91 encodings processed -> perplexity = 5.193302154541016
101 encodings processed -> perplexity = 5.004990100860596
111 encodings processed -> perplexity = 4.910062313079834
121 encodings processed -> perplexity = 4.870457172393799
131 encodings processed -> perplexity = 4.895934581756592
141 encodings processed -> perplexity = 4.754809856414795
151 encodings processed -> perplexity = 4.803183555603027
161 encodings processed -> perplexity = 4.777984619140625
171 encodings processed -

Computing perplexity on dataset frenchtext/banque-fr-2311 for togethercomputer/RedPajama-INCITE-Base-3B-v1
- dataset examples: 68166
- batch_size= 16, stride=256
- model torch dtype: torch.float16
- model vocabulary: 50277
- model sequence length: 2048
- perplexity = 5.301388263702393 (train)
- perplexity = 5.480365753173828 (valid) [+3,4%]

Computing perplexity on dataset frenchtext/banque-fr-2311 for openlm-research/open_llama_3b_v2
- dataset examples: 68166
- batch_size= 12, stride=256
- model torch dtype: torch.float16
- model vocabulary: 32000
- model sequence length: 2048
- perplexity = 4.064583778381348 (train)
- perplexity = 3.9680004119873047 (valid) [-2,3%]

Computing perplexity on dataset frenchtext/banque-fr-2311 for togethercomputer/RedPajama-INCITE-7B-Base
- dataset examples: 68166
- batch_size= 8, stride=256
- model torch dtype: torch.float16
- model vocabulary: 50277
- model sequence length: 2048
- perplexity = 4.955935478210449

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for mistralai/Mistral-7B-v0.1
- dataset examples: 8522
- batch_size= 4, stride=256
- model torch dtype: torch.bfloat16
- model vocabulary: 32000
- model sequence length: 4096
- perplexity = 3.9531056880950928 (valid)

Computing perplexity on dataset frenchtext/banque-fr-2311:valid for 01-ai/Yi-6B
- dataset examples: 8522
- batch_size= 4, stride=256
- model torch dtype: torch.bfloat16
- model vocabulary: 64000
- model sequence length: 4096
- perplexity = 3.990814685821533 (valid)